Added asterix project
git-svn-id: https://asterixdb.googlecode.com/svn/trunk/asterix@12 eaa15691-b419-025a-1212-ee371bd00084
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_1.aql
new file mode 100644
index 0000000..4409fec
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_1.aql
@@ -0,0 +1,32 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-1_1.adm';
+
+ // -- - Stage 1 - --
+ // Emit every distinct title token of DBLP, ordered by ascending number of
+ // papers grouped under it; ties are broken by the token value.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.1.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.1.1.aql
new file mode 100644
index 0000000..8deab14
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.1.1.aql
@@ -0,0 +1,34 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as closed {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-1_2.1.1.adm';
+
+ // -- - Stage 1 - --
+ // Emit every distinct title token, ordered by ascending count of the paper
+ // ids grouped under it (ties broken by token value); group-by has a hash hint.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.1.aql
new file mode 100644
index 0000000..8a492ae
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.1.aql
@@ -0,0 +1,34 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-1_2.1.adm';
+
+ // -- - Stage 1 - --
+ // Token ranking: group paper ids by title token (hash hint on the group-by)
+ // and return the tokens by ascending group count, then by token value.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.aql
new file mode 100644
index 0000000..68d26fe
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-1_2.aql
@@ -0,0 +1,33 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-1_2.adm';
+
+ // -- - Stage 1 - --
+ // Token ranking: group paper ids ($paperid) by title token and return the
+ // tokens by ascending group count, ties broken by token value.
+ for $paper in dataset('DBLP')
+ let $paperid := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paperid
+ order by count($paperid), $tokenGroupped
+ return $tokenGroupped
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2.1_5.3.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2.1_5.3.1.aql
new file mode 100644
index 0000000..f499ba3
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2.1_5.3.1.aql
@@ -0,0 +1,54 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as closed {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2.1_5.3.1.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, map every title token to its position ($i) in the
+ // Stage 1 ranking, then emit one record per prefix token of that rank list.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids, ties broken by token value.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = /*+ bcast*/ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard($lenDBLP, .5f))
+ order by $idDBLP, $prefixTokenDBLP // several rows share an id; the second key makes the output order deterministic
+ return {'id': $idDBLP, 'prefixToken': $prefixTokenDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2.2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2.2.aql
new file mode 100644
index 0000000..ce8dae4
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2.2.aql
@@ -0,0 +1,53 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as closed {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type TOKENSRANKEDADMType as closed {
+ token: int32,
+ rank: int32
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset TOKENSRANKEDADM(TOKENSRANKEDADMType) partitioned by key rank on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+load dataset TOKENSRANKEDADM
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/tokensranked.adm"),("format"="adm"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2.2.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, look up the rank of every title token in the loaded
+ // TOKENSRANKEDADM dataset and emit one record per prefix token of the ranks.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP) // NOTE(review): not referenced below; the prefix length uses len($tokensDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked in dataset('TOKENSRANKEDADM')
+ where $tokenUnranked = /*+ bcast*/ $tokenRanked.token
+ order by $tokenRanked.rank
+ return $tokenRanked.rank
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+ order by $idDBLP, $prefixTokenDBLP
+ return {'id': $idDBLP, 'prefixToken': $prefixTokenDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_1.aql
new file mode 100644
index 0000000..7328e60
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_1.aql
@@ -0,0 +1,44 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_1.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, map every title token to its position ($i) in the
+ // Stage 1 ranking and return the paper id with its sorted rank list.
+ for $paperDBLP in dataset('DBLP')
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped papers, ties broken by token value.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ order by $paperDBLP.id
+ return {'id': $paperDBLP.id, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_2.aql
new file mode 100644
index 0000000..5c5cc8e8
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_2.aql
@@ -0,0 +1,45 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_2.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, map every title token to its position ($i) in the
+ // Stage 1 ranking and return the paper id with its sorted rank list.
+ for $paperDBLP in dataset('DBLP')
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids ($id), ties broken by token value.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ order by $paperDBLP.id
+ return {'id': $paperDBLP.id, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_3.aql
new file mode 100644
index 0000000..6ffb2d5
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_3.aql
@@ -0,0 +1,46 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_3.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper (id bound to $idDBLP), map every title token to its
+ // position ($i) in the Stage 1 ranking and return id plus sorted rank list.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids ($id), ties broken by token value.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ order by $idDBLP
+ return {'id': $idDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_4.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_4.aql
new file mode 100644
index 0000000..6a0a011
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_4.aql
@@ -0,0 +1,47 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_4.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, tokenize the title once ($tokensUnrankedDBLP), map
+ // each token to its Stage 1 position ($i) and return id plus sorted ranks.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids ($id), ties broken by token value.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ order by $idDBLP
+ return {'id': $idDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.1.aql
new file mode 100644
index 0000000..7d3e144
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.1.aql
@@ -0,0 +1,49 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_5.1.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, return its id, the number of title tokens ('len')
+ // and the sorted list of Stage 1 positions ($i) of those tokens.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids, ties broken by token value (hash group-by hint).
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ order by $idDBLP
+ return {'id': $idDBLP, 'len': $lenDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.2.aql
new file mode 100644
index 0000000..f69e98f
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.2.aql
@@ -0,0 +1,49 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_5.2.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, return its id, the number of title tokens ('len')
+ // and the sorted list of Stage 1 positions ($i) of those tokens.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids, ties broken by token value (hash group-by hint).
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast*/ $tokenRanked
+ order by $i
+ return $i
+ order by $idDBLP
+ return {'id': $idDBLP, 'len': $lenDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.3.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.3.1.aql
new file mode 100644
index 0000000..ec9726d
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.3.1.aql
@@ -0,0 +1,50 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as closed {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_5.3.1.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, return its id, the number of title tokens ('len')
+ // and the sorted list of Stage 1 positions ($i) of those tokens.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids, ties broken by token value (hinted group/order).
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast*/ $tokenRanked
+ order by $i
+ return $i
+ order by $idDBLP
+ return {'id': $idDBLP, 'len': $lenDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.3.aql
new file mode 100644
index 0000000..a0b3011
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.3.aql
@@ -0,0 +1,50 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_5.3.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, return its id, the number of title tokens ('len')
+ // and the sorted list of Stage 1 positions ($i) of those tokens.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids, ties broken by token value (hinted group/order).
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast*/ $tokenRanked
+ order by $i
+ return $i
+ order by $idDBLP
+ return {'id': $idDBLP, 'len': $lenDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.aql
new file mode 100644
index 0000000..6c5e1ac
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-2_5.aql
@@ -0,0 +1,48 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-2_5.adm';
+
+ // -- - Stage 2 - --
+ // For each DBLP paper, return its id, the number of title tokens ('len')
+ // and the sorted list of Stage 1 positions ($i) of those tokens.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped paper ids ($id), ties broken by token value.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ order by $idDBLP
+ return {'id': $idDBLP, 'len': $lenDBLP, 'tokens': $tokensDBLP}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.1.aql
new file mode 100644
index 0000000..e871290
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.1.aql
@@ -0,0 +1,94 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-3_1.1.adm';
+
+// -- - Stage 3 - --
+// Join the id pairs produced by the subquery back to DBLP (twice) and return
+// the full left and right records with the pair's similarity, sorted by ids.
+for $ridpair in
+ // -- - Stage 2 - --
+ // Pair up papers that share a prefix token, keep pairs whose
+ // similarity-jaccard-prefix value is >= .5f, one record per id pair.
+ for $paperLeft in dataset('DBLP')
+ let $lenLeft := len(counthashed-word-tokens($paperLeft.title))
+ let $tokensLeft :=
+ for $tokenUnranked in counthashed-word-tokens($paperLeft.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped papers, ties broken by token value (hash group-by hint).
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenLeft in subset-collection(
+ $tokensLeft,
+ 0,
+ prefix-len-jaccard($lenLeft, .5f))
+
+ for $paperRight in dataset('DBLP')
+ let $lenRight := len(counthashed-word-tokens($paperRight.title))
+ let $tokensRight :=
+ for $tokenUnranked in counthashed-word-tokens($paperRight.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Same ranked token list as computed for the left side, repeated here
+ // for the right side of the self-join.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenRight in subset-collection(
+ $tokensRight,
+ 0,
+ prefix-len-jaccard($lenRight, .5f))
+
+ where $prefixTokenLeft = $prefixTokenRight
+
+ let $sim := similarity-jaccard-prefix(
+ $lenLeft,
+ $tokensLeft,
+ $lenRight,
+ $tokensRight,
+ $prefixTokenLeft,
+ .5f)
+ where $sim >= .5f and $paperLeft.id < $paperRight.id
+ /*+ hash */
+ group by $idLeft := $paperLeft.id, $idRight := $paperRight.id with $sim
+ return {'idLeft': $idLeft, 'idRight': $idRight, 'sim': $sim[0]}
+
+for $paperLeft in dataset('DBLP')
+for $paperRight in dataset('DBLP')
+where $ridpair.idLeft = $paperLeft.id and $ridpair.idRight = $paperRight.id
+order by $paperLeft.id, $paperRight.id
+return {'left': $paperLeft, 'right': $paperRight, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.2.aql
new file mode 100644
index 0000000..d1d65dc
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.2.aql
@@ -0,0 +1,94 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-3_1.2.adm';
+
+// -- - Stage 3 - --
+// Join the id pairs produced by the subquery back to DBLP (twice) and return
+// the full left and right records with the pair's similarity, sorted by ids.
+for $ridpair in
+ // -- - Stage 2 - --
+ // Pair up papers that share a prefix token, keep pairs whose
+ // similarity-jaccard-prefix value is >= .5f, one record per id pair.
+ for $paperLeft in dataset('DBLP')
+ let $lenLeft := len(counthashed-word-tokens($paperLeft.title))
+ let $tokensLeft :=
+ for $tokenUnranked in counthashed-word-tokens($paperLeft.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped papers, ties broken by token value (hash group-by hint).
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenLeft in subset-collection(
+ $tokensLeft,
+ 0,
+ prefix-len-jaccard($lenLeft, .5f))
+
+ for $paperRight in dataset('DBLP')
+ let $lenRight := len(counthashed-word-tokens($paperRight.title))
+ let $tokensRight :=
+ for $tokenUnranked in counthashed-word-tokens($paperRight.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Same ranked token list as computed for the left side, repeated here
+ // for the right side of the self-join.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenRight in subset-collection(
+ $tokensRight,
+ 0,
+ prefix-len-jaccard($lenRight, .5f))
+
+ where $prefixTokenLeft = $prefixTokenRight
+
+ let $sim := similarity-jaccard-prefix(
+ $lenLeft,
+ $tokensLeft,
+ $lenRight,
+ $tokensRight,
+ $prefixTokenLeft,
+ .5f)
+ where $sim >= .5f and $paperLeft.id < $paperRight.id
+ /*+ hash */
+ group by $idLeft := $paperLeft.id, $idRight := $paperRight.id with $sim
+ return {'idLeft': $idLeft, 'idRight': $idRight, 'sim': $sim[0]}
+
+for $paperLeft in dataset('DBLP')
+for $paperRight in dataset('DBLP')
+where $ridpair.idLeft = $paperLeft.id and $ridpair.idRight = $paperRight.id
+order by $paperLeft.id, $paperRight.id
+return {'left': $paperLeft, 'right': $paperRight, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.aql
new file mode 100644
index 0000000..eea456e
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-3_1.aql
@@ -0,0 +1,91 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-3_1.adm';
+
+// -- - Stage 3 - --
+// Scan DBLP twice, then join both scans with the id pairs from the subquery;
+// returns the full left and right records with the pair's similarity.
+for $paperLeft in dataset('DBLP')
+for $paperRight in dataset('DBLP')
+for $ridpair in
+ // -- - Stage 2 - --
+ // Pair up papers that share a prefix token and keep pairs with similarity
+ // >= .5f. NOTE: $paperLeft/$paperRight are re-bound inside this subquery.
+ for $paperLeft in dataset('DBLP')
+ let $lenLeft := len(counthashed-word-tokens($paperLeft.title))
+ let $tokensLeft :=
+ for $tokenUnranked in counthashed-word-tokens($paperLeft.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Ranked token list: distinct title tokens ordered by ascending count
+ // of grouped papers, ties broken by token value.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenLeft in subset-collection(
+ $tokensLeft,
+ 0,
+ prefix-len-jaccard($lenLeft, .5f))
+
+ for $paperRight in dataset('DBLP')
+ let $lenRight := len(counthashed-word-tokens($paperRight.title))
+ let $tokensRight :=
+ for $tokenUnranked in counthashed-word-tokens($paperRight.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Same ranked token list as computed for the left side, repeated here
+ // for the right side of the self-join.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenRight in subset-collection(
+ $tokensRight,
+ 0,
+ prefix-len-jaccard($lenRight, .5f))
+
+ where $prefixTokenLeft = $prefixTokenRight
+
+ let $sim := similarity-jaccard-prefix(
+ $lenLeft,
+ $tokensLeft,
+ $lenRight,
+ $tokensRight,
+ $prefixTokenLeft,
+ .5f)
+ where $sim >= .5f and $paperLeft.id < $paperRight.id
+ group by $idLeft := $paperLeft.id, $idRight := $paperRight.id with $sim
+ return {'idLeft': $idLeft, 'idRight': $idRight, 'sim': $sim[0]}
+
+where $ridpair.idLeft = $paperLeft.id and $ridpair.idRight = $paperRight.id
+order by $paperLeft.id, $paperRight.id
+return {'left': $paperLeft, 'right': $paperRight, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-aqlplus_1.aql
new file mode 100644
index 0000000..089fc47
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-aqlplus_1.aql
@@ -0,0 +1,29 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as closed {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+
+load dataset DBLP from nc1:'data/pub-small/dblp-small-id.txt' delimited by ':';
+
+write output to nc1:'rttest/fuzzyjoin_dblp-aqlplus_1.adm';
+/* NOTE(review): simthreshold is presumably the cutoff applied by the ~= operator below - confirm. */
+set simthreshold '.5f';
+/* Self-join of DBLP on title ~= title; $dblp.id < $dblp2.id keeps each unordered pair once and drops self-pairs. */
+for $dblp in dataset('DBLP')
+for $dblp2 in dataset('DBLP')
+where $dblp.title ~= $dblp2.title and $dblp.id < $dblp2.id
+order by $dblp.id, $dblp2.id
+return {'dblp': $dblp, 'dblp2': $dblp2}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_1.aql
new file mode 100644
index 0000000..a5bb9cd
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_1.aql
@@ -0,0 +1,93 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_1.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX, sim).
+ for $paperDBLP in dataset('DBLP')
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds $paper records),
+ // -- sorted by count($paper) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $tokensCSX :=
+ for $tokenUnranked in counthashed-word-tokens($paperCSX.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side (CSX is not scanned here); a CSX token
+ // -- equal to no $tokenRanked fails the where clause below and is absent from $tokensCSX.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ len(counthashed-word-tokens($paperDBLP.title)),
+ $tokensDBLP,
+ len(counthashed-word-tokens($paperCSX.title)),
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $paperDBLP.id, $idCSX := $paperCSX.id, $sim := $sim with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_2.aql
new file mode 100644
index 0000000..fab6877
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_2.aql
@@ -0,0 +1,95 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_2.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX, sim).
+ for $paperDBLP in dataset('DBLP')
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds only $id values),
+ // -- sorted by count($id) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $tokensCSX :=
+ for $tokenUnranked in counthashed-word-tokens($paperCSX.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side (CSX is not scanned here); a CSX token
+ // -- equal to no $tokenRanked fails the where clause below and is absent from $tokensCSX.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ len(counthashed-word-tokens($paperDBLP.title)),
+ $tokensDBLP,
+ len(counthashed-word-tokens($paperCSX.title)),
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $paperDBLP.id, $idCSX := $paperCSX.id, $sim := $sim with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_3.aql
new file mode 100644
index 0000000..34654e3
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_3.aql
@@ -0,0 +1,98 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_3.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX, sim).
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds only $id values),
+ // -- sorted by count($id) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensCSX :=
+ for $tokenUnranked in counthashed-word-tokens($paperCSX.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side (CSX is not scanned here); a CSX token
+ // -- equal to no $tokenRanked fails the where clause below and is absent from $tokensCSX.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ len(counthashed-word-tokens($paperDBLP.title)),
+ $tokensDBLP,
+ len(counthashed-word-tokens($paperCSX.title)),
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_4.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_4.aql
new file mode 100644
index 0000000..f2ad20d
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_4.aql
@@ -0,0 +1,99 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_4.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX, sim).
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds only $id values),
+ // -- sorted by count($id) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side (CSX is not scanned here); a CSX token
+ // -- equal to no $tokenRanked fails the where clause below and is absent from $tokensCSX.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ len($tokensUnrankedDBLP),
+ $tokensDBLP,
+ len($tokensUnrankedCSX),
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.1.aql
new file mode 100644
index 0000000..c5d4b73
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.1.aql
@@ -0,0 +1,103 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_5.1.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX).
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds only $id values),
+ // -- sorted by count($id) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side; CSX tokens matching no $tokenRanked are dropped.
+ // -- NOTE(review): the hash annotation before group by is presumably an optimizer hint - confirm its effect.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2.aql
new file mode 100644
index 0000000..b272e34
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2.aql
@@ -0,0 +1,104 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_5.2.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX).
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds only $id values),
+ // -- sorted by count($id) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side; CSX tokens matching no $tokenRanked are dropped.
+ // -- NOTE(review): the hash annotations before each group by are presumably optimizer hints - confirm their effect.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.1.aql
new file mode 100644
index 0000000..fd58328
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.1.aql
@@ -0,0 +1,107 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as closed {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as closed {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_5.3.1.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX, sim).
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds only $id values),
+ // -- sorted by count($id) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side; CSX tokens matching no $tokenRanked are dropped.
+ // -- NOTE(review): the hash, inmem and bcast annotations are presumably optimizer hints - confirm their effect.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.aql
new file mode 100644
index 0000000..a52183f
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.aql
@@ -0,0 +1,106 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_5.3.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX).
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds only $id values),
+ // -- sorted by count($id) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side; CSX tokens matching no $tokenRanked are dropped.
+ // -- NOTE(review): the hash, inmem and bcast annotations are presumably optimizer hints - confirm their effect.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.aql
new file mode 100644
index 0000000..7ccdc0c
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.aql
@@ -0,0 +1,101 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-2_5.adm';
+
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX, sim).
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds only $id values),
+ // -- sorted by count($id) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side (CSX is not scanned here); a CSX token
+ // -- equal to no $tokenRanked fails the where clause below and is absent from $tokensCSX.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
+ order by $idDBLP, $idCSX
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_1.aql
new file mode 100644
index 0000000..b22d355
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_1.aql
@@ -0,0 +1,103 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_1.adm';
+
+// -- - Stage 3 - --
+// -- Joins each Stage 2 id pair ($ridpair) back to its DBLP and CSX records and returns both
+// -- records together with the pair's sim value, ordered by (DBLP id, CSX id).
+for $ridpair in
+ // -- - Stage 2 - --
+ // -- Joins DBLP and CSX papers on a shared token within their ranked-token prefixes, keeps
+ // -- pairs whose similarity-jaccard-prefix value is >= .5f, grouped by (idDBLP, idCSX, sim).
+ for $paperDBLP in dataset('DBLP')
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Token order: tokens of all DBLP titles, grouped by token (each group holds $paper records),
+ // -- sorted by count($paper) and then by token; "at $i" above gives a token's position in it.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $tokensCSX :=
+ for $tokenUnranked in counthashed-word-tokens($paperCSX.title)
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // -- Same DBLP-based token order as on the DBLP side (CSX is not scanned here); a CSX token
+ // -- equal to no $tokenRanked fails the where clause below and is absent from $tokensCSX.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ len(counthashed-word-tokens($paperDBLP.title)),
+ $tokensDBLP,
+ len(counthashed-word-tokens($paperCSX.title)),
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $paperDBLP.id, $idCSX := $paperCSX.id, $sim := $sim with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_2.aql
new file mode 100644
index 0000000..5dc2cad
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_2.aql
@@ -0,0 +1,104 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_2.adm';
+
+// Stage 3: join every RID pair produced by Stage 2 back to its full DBLP and CSX records.
+// -- - Stage 3 - --
+// Results are ordered by DBLP id, then CSX id.
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $tokensCSX :=
+ for $tokenUnranked in counthashed-word-tokens($paperCSX.title)
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ len(counthashed-word-tokens($paperDBLP.title)),
+ $tokensDBLP,
+ len(counthashed-word-tokens($paperCSX.title)),
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $paperDBLP.id, $idCSX := $paperCSX.id, $sim := $sim with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_3.aql
new file mode 100644
index 0000000..0ce00ca
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_3.aql
@@ -0,0 +1,106 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_3.adm';
+
+// Stage 3: join every RID pair produced by Stage 2 back to its full DBLP and CSX records.
+// -- - Stage 3 - --
+// Results are ordered by DBLP id, then CSX id.
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensDBLP :=
+ for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensCSX :=
+ for $tokenUnranked in counthashed-word-tokens($paperCSX.title)
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ len(counthashed-word-tokens($paperDBLP.title)),
+ $tokensDBLP,
+ len(counthashed-word-tokens($paperCSX.title)),
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_4.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_4.aql
new file mode 100644
index 0000000..ac683c8
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_4.aql
@@ -0,0 +1,108 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_4.adm';
+
+// Stage 3: join every RID pair produced by Stage 2 back to its full DBLP and CSX records.
+// -- - Stage 3 - --
+// Results are ordered by DBLP id, then CSX id.
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ len($tokensUnrankedDBLP),
+ $tokensDBLP,
+ len($tokensUnrankedCSX),
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.1.aql
new file mode 100644
index 0000000..fd6ad95
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.1.aql
@@ -0,0 +1,112 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_5.1.adm';
+
+// Stage 3: join every RID pair produced by Stage 2 back to its full DBLP and CSX records.
+// -- - Stage 3 - --
+// Results are ordered by DBLP id, then CSX id.
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.2.aql
new file mode 100644
index 0000000..cc49092
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.2.aql
@@ -0,0 +1,113 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_5.2.adm';
+
+// Stage 3: join every RID pair produced by Stage 2 back to its full DBLP and CSX records.
+// -- - Stage 3 - --
+// Results are ordered by DBLP id, then CSX id.
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.1.aql
new file mode 100644
index 0000000..66fb57e
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.1.aql
@@ -0,0 +1,115 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_5.3.1.adm';
+
+// Stage 3: join every RID pair produced by Stage 2 back to its full DBLP and CSX records.
+// -- - Stage 3 - --
+// Results are ordered by DBLP id, then CSX id.
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.aql
new file mode 100644
index 0000000..93545b3
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.aql
@@ -0,0 +1,115 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_5.3.adm';
+
+// Stage 3: join every RID pair produced by Stage 2 back to its full DBLP and CSX records.
+// -- - Stage 3 - --
+// Results are ordered by DBLP id, then CSX id.
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGroupped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.1.aql
new file mode 100644
index 0000000..9782508
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.1.aql
@@ -0,0 +1,119 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_5.4.1.adm';
+
+// Stage 3: join the Stage 2 RID pairs to DBLP first (inner query), then to CSX (outer query).
+// -- - Stage 3 - --
+// $paperDBLP and $paperCSX are declared again inside Stage 2; output is ordered by DBLP id, then CSX id.
+for $paperCSX in dataset('CSX')
+for $paperDBLPridpair in
+for $paperDBLP in dataset('DBLP')
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+where $ridpair.idDBLP = $paperDBLP.id
+return {'idDBLP': $paperDBLP.id, 'idCSX': $ridpair.idCSX, 'paperDBLP': $paperDBLP, 'sim': $ridpair.sim}
+
+where $paperDBLPridpair.idCSX = $paperCSX.id
+order by $paperDBLPridpair.idDBLP, $paperDBLPridpair.idCSX
+return {'dblp': $paperDBLPridpair.paperDBLP, 'csx': $paperCSX, 'sim': $paperDBLPridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.aql
new file mode 100644
index 0000000..cb5b8ba
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.aql
@@ -0,0 +1,120 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_5.4.adm';
+
+// Stage 3: join the Stage 2 RID pairs to DBLP first (inner query), then to CSX (outer query).
+// -- - Stage 3 - --
+// $paperDBLP and $paperCSX are declared again inside Stage 2; output is ordered by DBLP id, then CSX id.
+for $paperCSX in dataset('CSX')
+for $paperDBLPridpair in
+for $paperDBLP in dataset('DBLP')
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+where $ridpair.idDBLP = $paperDBLP.id
+return {'idDBLP': $paperDBLP.id, 'paperDBLP': $paperDBLP, 'idCSX': $ridpair.idCSX, 'sim': $ridpair.sim}
+
+where $paperDBLPridpair.idCSX = $paperCSX.id
+// order by $paperDBLPridpair.idDBLP, $paperDBLPridpair.idCSX
+order by $paperDBLPridpair.paperDBLP.id, $paperDBLPridpair.idCSX
+return {'dblp': $paperDBLPridpair.paperDBLP, 'csx': $paperCSX, 'sim': $paperDBLPridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.aql
new file mode 100644
index 0000000..0f1f2fe
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.aql
@@ -0,0 +1,111 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-3_5.adm';
+
+// Stage 3: join every RID pair produced by Stage 2 back to its full DBLP and CSX records.
+// -- - Stage 3 - --
+// Results are ordered by DBLP id, then CSX id.
+for $ridpair in
+ // Stage 2: pair DBLP and CSX papers whose ranked-token prefixes share a token.
+ // -- - Stage 2 - --
+ // Emits {idDBLP, idCSX, sim} for pairs whose similarity-jaccard-prefix score is >= .5f.
+ for $paperDBLP in dataset('DBLP')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ // Stage 1: all DBLP title tokens, ordered by ascending group count, then by token value.
+ // -- - Stage 1 - --
+ // A token's position ($i) in this sequence serves as its rank.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection(
+ $tokensDBLP,
+ 0,
+ prefix-len-jaccard(len($tokensDBLP), .5f))
+
+ for $paperCSX in dataset('CSX')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ // Stage 1 (repeated for the CSX side): the token order is again built from DBLP titles.
+ // -- - Stage 1 - --
+ // CSX tokens are thus ranked against the same ordering as the DBLP tokens.
+ for $paper in dataset('DBLP')
+ let $id := $paper.id
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $id
+ order by count($id), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection(
+ $tokensCSX,
+ 0,
+ prefix-len-jaccard(len($tokensCSX), .5f))
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+
+ let $sim := similarity-jaccard-prefix(
+ $lenDBLP,
+ $tokensDBLP,
+ $lenCSX,
+ $tokensCSX,
+ $prefixTokenDBLP,
+ .5f)
+ where $sim >= .5f
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}
+
+for $paperDBLP in dataset('DBLP')
+for $paperCSX in dataset('CSX')
+where $ridpair.idDBLP = $paperDBLP.id and $ridpair.idCSX = $paperCSX.id
+order by $paperDBLP.id, $paperCSX.id
+return {'dblp': $paperDBLP, 'csx': $paperCSX, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_1.aql
new file mode 100644
index 0000000..ead8e81
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_1.aql
@@ -0,0 +1,39 @@
+drop dataverse fuzzyjoin if exists;
+// Fuzzy-join test: pairs DBLP and CSX papers whose titles match under the ~= operator.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+// Both datasets are loaded from ':'-delimited text files addressed through nc1.
+load dataset DBLP from nc1:'data/pub-small/dblp-small-id.txt' delimited by ':';
+load dataset CSX from nc1:'data/pub-small/csx-small-id.txt' delimited by ':';
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-aqlplus_1.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// DBLP is bound by the first for clause and is the left ~= operand; output is ordered by (DBLP id, CSX id).
+for $i in dataset('DBLP')
+for $j in dataset('CSX')
+where $i.title ~= $j.title
+order by $i.id, $j.id
+return {'dblp': $i, 'csx': $j}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_2.aql
new file mode 100644
index 0000000..611fd29
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_2.aql
@@ -0,0 +1,39 @@
+drop dataverse fuzzyjoin if exists;
+// Fuzzy-join test: pairs DBLP and CSX papers whose titles match under the ~= operator.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+// Both datasets are loaded from ':'-delimited text files addressed through nc1.
+load dataset DBLP from nc1:'data/pub-small/dblp-small-id.txt' delimited by ':';
+load dataset CSX from nc1:'data/pub-small/csx-small-id.txt' delimited by ':';
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-aqlplus_2.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// CSX is bound by the first for clause while DBLP stays the left ~= operand; output is ordered by (DBLP id, CSX id).
+for $csx in dataset('CSX')
+for $dblp in dataset('DBLP')
+where $dblp.title ~= $csx.title
+order by $dblp.id, $csx.id
+return {'dblp': $dblp, 'csx': $csx}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_3.aql
new file mode 100644
index 0000000..e6dc1ed
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_3.aql
@@ -0,0 +1,39 @@
+drop dataverse fuzzyjoin if exists;
+// Fuzzy-join test: pairs DBLP and CSX papers whose titles match under the ~= operator.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+// Both datasets are loaded from ':'-delimited text files addressed through nc1.
+load dataset DBLP from nc1:'data/pub-small/dblp-small-id.txt' delimited by ':';
+load dataset CSX from nc1:'data/pub-small/csx-small-id.txt' delimited by ':';
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-aqlplus_3.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// DBLP is bound by the first for clause but CSX is the left ~= operand; output is ordered by (DBLP id, CSX id).
+for $dblp in dataset('DBLP')
+for $csx in dataset('CSX')
+where $csx.title ~= $dblp.title
+order by $dblp.id, $csx.id
+return {'dblp': $dblp, 'csx': $csx}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-dblp-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-dblp-aqlplus_1.aql
new file mode 100644
index 0000000..e1b53d3
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-dblp-aqlplus_1.aql
@@ -0,0 +1,40 @@
+drop dataverse fuzzyjoin if exists;
+// Three-way fuzzy-join test: DBLP ~= CSX on title, then CSX ~= a second DBLP binding on authors.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+create dataset CSX(CSXType) partitioned by key id on group1;
+// Both datasets are loaded from ':'-delimited text files addressed through nc1.
+load dataset DBLP from nc1:'data/pub-small/dblp-small-id.txt' delimited by ':';
+load dataset CSX from nc1:'data/pub-small/csx-small-id.txt' delimited by ':';
+
+write output to nc1:'rttest/fuzzyjoin_dblp-csx-dblp-aqlplus_1.adm';
+// NOTE(review): simthreshold is presumably the threshold both ~= predicates apply below - confirm.
+set simthreshold '.5f';
+// $dblp and $dblp2 both range over DBLP; output is ordered by ($dblp.id, $csx.id, $dblp2.id).
+for $dblp in dataset('DBLP')
+for $csx in dataset('CSX')
+for $dblp2 in dataset('DBLP')
+where $dblp.title ~= $csx.title and $csx.authors ~= $dblp2.authors
+order by $dblp.id, $csx.id, $dblp2.id
+return {'dblp': $dblp, 'csx': $csx, 'dblp2': $dblp2}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-lookup_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-lookup_1.aql
new file mode 100644
index 0000000..a235756
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-lookup_1.aql
@@ -0,0 +1,29 @@
+drop dataverse fuzzyjoin if exists;
+// Lookup test: loads DBLP and returns the papers whose id equals 1.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) partitioned by key id on group1;
+// The load goes through NCFileSystemAdapter, configured for ':'-delimited text.
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+
+write output to nc1:'rttest/fuzzyjoin_dblp-lookup_1.adm';
+// Equality filter on id, which is also the dataset's partitioning key.
+for $paper in dataset('DBLP')
+where $paper.id = 1
+return $paper
+
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-splits-3_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-splits-3_1.aql
new file mode 100644
index 0000000..68800dd
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-splits-3_1.aql
@@ -0,0 +1,166 @@
+drop dataverse fuzzyjoin if exists;
+// Three-stage fuzzy self-join of DBLP on title, written out by hand (token ranking, id pairs, record lookup).
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// NOTE(review): group1 is created here but no later statement in this script refers to it.
+create nodegroup group1 if not exists on nc1, nc2;
+// DBLP is declared as an external dataset over a ':'-delimited text file instead of being loaded.
+create external dataset DBLP(DBLPType)
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+
+
+write output to nc1:'rttest/fuzzyjoin_dblp-splits-3_1.adm';
+
+//
+// -- - Stage 3 - --
+// Look up the full left and right records for every id pair produced by Stage 2.
+for $ridpair in
+ //
+ // -- - Stage 2 - --
+ // Emit {idLeft, idRight, sim} for paper pairs that share a prefix token and have sim >= .5f.
+ for $paperLeft in dataset('DBLP')
+ let $lenLeft := len(counthashed-word-tokens($paperLeft.title))
+ let $tokensLeft :=
+ for $tokenUnranked in counthashed-word-tokens($paperLeft.title)
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - --
+ // One entry per distinct title token, ordered by paper count and then by the token itself.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefix_tokenLeft in subset-collection(
+ $tokensLeft,
+ 0,
+ prefix-len-jaccard(len($tokensLeft), .5f))
+
+ for $paperRight in dataset('DBLP')
+ let $lenRight := len(counthashed-word-tokens($paperRight.title))
+ let $tokensRight :=
+ for $tokenUnranked in counthashed-word-tokens($paperRight.title)
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - --
+ // Same ranking subquery as on the left side, repeated verbatim.
+ for $paper in dataset('DBLP')
+ for $token in counthashed-word-tokens($paper.title)
+ group by $tokenGroupped := $token with $paper
+ order by count($paper), $tokenGroupped
+ return $tokenGroupped
+ where $tokenUnranked = $tokenRanked
+ order by $i
+ return $i
+ for $prefix_tokenRight in subset-collection(
+ $tokensRight,
+ 0,
+ prefix-len-jaccard(len($tokensRight), .5f))
+ // Only pairs that have a prefix token in common go on to the similarity check.
+ where $prefix_tokenLeft = $prefix_tokenRight
+ // A pair can match on several prefix tokens; the group by below collapses those rows and $sim[0] keeps the first value.
+ let $sim := similarity-jaccard-prefix(
+ $lenLeft,
+ $tokensLeft,
+ $lenRight,
+ $tokensRight,
+ $prefix_tokenLeft,
+ .5f)
+ where $sim >= .5f and $paperLeft.id < $paperRight.id
+ group by $idLeft := $paperLeft.id, $idRight := $paperRight.id with $sim
+ return {'idLeft': $idLeft, 'idRight': $idRight, 'sim': $sim[0]}
+
+for $paperLeft in dataset('DBLP')
+for $paperRight in dataset('DBLP')
+where $ridpair.idLeft = $paperLeft.id and $ridpair.idRight = $paperRight.id
+order by $paperLeft.id, $paperRight.id
+return {'left': $paperLeft, 'right': $paperRight, 'sim': $ridpair.sim}
+
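+/* NOTE(review): stack trace kept with this test; presumably the failure seen when compiling the query above - confirm it still reproduces.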
+edu.uci.ics.aqua.common.exceptions.AquaException: Attempting to construct a nested plan with 3 operator descriptors. Currently, nested plans can only consist in linear pipelines of Asterix micro operators.
+ at edu.uci.ics.aqua.algebra.operators.physical.AbstractGroupByPhysicalOperator.buildPipelineWithProjection(AbstractGroupByPhysicalOperator.java:47)
+ at edu.uci.ics.aqua.algebra.operators.physical.AbstractGroupByPhysicalOperator.compileSubplans(AbstractGroupByPhysicalOperator.java:29)
+ at edu.uci.ics.aqua.algebra.operators.physical.PreSortedGroupByPOperator.contributeRuntimeOperator(PreSortedGroupByPOperator.java:133)
+ at edu.uci.ics.aqua.algebra.operators.logical.AbstractLogicalOperator.contributeRuntimeOperator(AbstractLogicalOperator.java:208)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:52)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compileOpRef(PlanCompiler.java:44)
+ at edu.uci.ics.aqua.jobgen.impl.PlanCompiler.compilePlan(PlanCompiler.java:30)
+ at edu.uci.ics.aqua.api.HeuristicCompilerFactoryBuilder$1$1.createJob(HeuristicCompilerFactoryBuilder.java:64)
+ at edu.uci.ics.asterix.api.common.APIFramework.compileQuery(APIFramework.java:323)
+ at edu.uci.ics.asterix.api.java.AsterixJavaClient.compile(AsterixJavaClient.java:71)
+ at edu.uci.ics.asterix.test.runtime.functions.RuntimeFunctionsTest.test(RuntimeFunctionsTest.java:150)
+ at sun.reflect.GeneratedMethodAccessor28.invoke(Unknown Source)
+ at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
+ at java.lang.reflect.Method.invoke(Method.java:597)
+ at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:44)
+ at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:15)
+ at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:41)
+ at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:20)
+ at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:76)
+ at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50)
+ at org.junit.runners.ParentRunner$3.run(ParentRunner.java:193)
+ at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:52)
+ at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:191)
+ at org.junit.runners.ParentRunner.access$000(ParentRunner.java:42)
+ at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:184)
+ at org.junit.runners.ParentRunner.run(ParentRunner.java:236)
+ at org.junit.runners.Suite.runChild(Suite.java:128)
+ at org.junit.runners.Suite.runChild(Suite.java:24)
+ at org.junit.runners.ParentRunner$3.run(ParentRunner.java:193)
+ at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:52)
+ at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:191)
+ at org.junit.runners.ParentRunner.access$000(ParentRunner.java:42)
+ at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:184)
+ at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:28)
+ at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:31)
+ at org.junit.runners.ParentRunner.run(ParentRunner.java:236)
+ at org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:49)
+ at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
+ at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:467)
+ at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:683)
+ at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:390)
+ at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:197)
+
+*/
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/events-users-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/events-users-aqlplus_1.aql
new file mode 100644
index 0000000..6093823
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/events-users-aqlplus_1.aql
@@ -0,0 +1,44 @@
+drop dataverse fuzzyjoin if exists;
+create dataverse fuzzyjoin;
+use dataverse fuzzyjoin;
+// Fuzzy-join test: for every user, lists the names of users whose interests match under ~=.
+
+create type AddressType as closed {
+ street: string,
+ city: string,
+ zip: string,
+ latlong: point
+}
+
+create type UserType as open{
+ name: string,
+ interests: <string>,
+ address: AddressType,
+ member_of: <
+ {
+ sig_id: int32,
+ chapter_name: string,
+ member_since: date
+ }
+>
+}
+
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset User(UserType)
+ partitioned by key name on group1;
+load dataset User from nc1:'data/events/tiny/user.adm';
+
+write output to nc1:'rttest/fuzzyjoin_events-users-aqlplus_1.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// The inner query has no filter excluding $user itself, so similar_users may contain the user's own name.
+for $user in dataset('User')
+let $similar_users :=
+ for $similar_user in dataset('User')
+ where $user.interests ~= $similar_user.interests
+ order by $similar_user.name
+ return { "user_name": $similar_user.name }
+order by $user.name
+return { "user_name": $user.name, "similar_users": $similar_users }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_1.aql
new file mode 100644
index 0000000..7e4ab38
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_1.aql
@@ -0,0 +1,26 @@
+drop dataverse fuzzyjoin if exists;
+create dataverse fuzzyjoin;
+use dataverse fuzzyjoin;
+// Fuzzy self-join test: pairs of Users whose interests match under the ~= operator.
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-int-aqlplus_1.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// $user.uid < $user2.uid never pairs a uid with itself and emits each matching pair in one orientation only.
+for $user in dataset('Users')
+for $user2 in dataset('Users')
+where $user.interests ~= $user2.interests and $user.uid < $user2.uid
+order by $user.uid, $user2.uid
+return { 'user': $user, 'user2': $user2 }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_2.aql
new file mode 100644
index 0000000..d1e49a0
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_2.aql
@@ -0,0 +1,26 @@
+drop dataverse fuzzyjoin if exists;
+create dataverse fuzzyjoin;
+use dataverse fuzzyjoin;
+// Fuzzy self-join test: pairs of Users whose interests match under the ~= operator.
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-int-aqlplus_2.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// $user2 is bound by the first for clause here, while $user stays the left ~= operand and the smaller uid.
+for $user2 in dataset('Users')
+for $user in dataset('Users')
+where $user.interests ~= $user2.interests and $user.uid < $user2.uid
+order by $user.uid, $user2.uid
+return { 'user': $user, 'user2': $user2 }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_3.aql
new file mode 100644
index 0000000..bf15bf6
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-int-aqlplus_3.aql
@@ -0,0 +1,26 @@
+drop dataverse fuzzyjoin if exists;
+create dataverse fuzzyjoin;
+use dataverse fuzzyjoin;
+// Fuzzy self-join test: pairs of Users whose interests match under the ~= operator.
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-int-aqlplus_3.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// $user is bound by the first for clause, but $user2 is the left ~= operand; $user still carries the smaller uid.
+for $user in dataset('Users')
+for $user2 in dataset('Users')
+where $user2.interests ~= $user.interests and $user.uid < $user2.uid
+order by $user.uid, $user2.uid
+return { 'user': $user, 'user2': $user2 }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_1.1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_1.1.aql
new file mode 100644
index 0000000..ccdd07f
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_1.1.aql
@@ -0,0 +1,27 @@
+drop dataverse fuzzyjoin if exists;
+create dataverse fuzzyjoin;
+use dataverse fuzzyjoin;
+// Fuzzy self-join test on lottery_numbers that also returns a similarity-jaccard score and keeps only 3 pairs.
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-lot-aqlplus_1.1.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// $sim is computed after the ~= filter; output is the first 3 pairs by $sim descending, ties ordered by the two uids.
+for $user in dataset('Users')
+for $user2 in dataset('Users')
+where $user.lottery_numbers ~= $user2.lottery_numbers and $user.uid < $user2.uid
+let $sim := similarity-jaccard($user.lottery_numbers, $user2.lottery_numbers)
+order by $sim desc, $user.uid, $user2.uid limit 3
+return { 'user': $user, 'user2': $user2, 'sim': $sim }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_1.aql
new file mode 100644
index 0000000..3b4a6fe
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_1.aql
@@ -0,0 +1,26 @@
+drop dataverse fuzzyjoin if exists;
+create dataverse fuzzyjoin;
+use dataverse fuzzyjoin;
+// Fuzzy self-join test: pairs of Users whose lottery_numbers lists match under the ~= operator.
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-lot-aqlplus_1.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// $user.uid < $user2.uid never pairs a uid with itself and emits each matching pair in one orientation only.
+for $user in dataset('Users')
+for $user2 in dataset('Users')
+where $user.lottery_numbers ~= $user2.lottery_numbers and $user.uid < $user2.uid
+order by $user.uid, $user2.uid
+return { 'user': $user, 'user2': $user2 }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_2.aql
new file mode 100644
index 0000000..3e970c7
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_2.aql
@@ -0,0 +1,26 @@
+drop dataverse fuzzyjoin if exists;
+create dataverse fuzzyjoin;
+use dataverse fuzzyjoin;
+// Fuzzy self-join test: pairs of Users whose lottery_numbers lists match under the ~= operator.
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-lot-aqlplus_2.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// $user2 is bound by the first for clause here, while $user stays the left ~= operand and the smaller uid.
+for $user2 in dataset('Users')
+for $user in dataset('Users')
+where $user.lottery_numbers ~= $user2.lottery_numbers and $user.uid < $user2.uid
+order by $user.uid, $user2.uid
+return { 'user': $user, 'user2': $user2 }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_3.aql
new file mode 100644
index 0000000..1979ba1
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-lot-aqlplus_3.aql
@@ -0,0 +1,26 @@
+drop dataverse fuzzyjoin if exists;
+create dataverse fuzzyjoin;
+use dataverse fuzzyjoin;
+// Fuzzy self-join test: pairs of Users whose lottery_numbers lists match under the ~= operator.
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-lot-aqlplus_3.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// $user is bound by the first for clause, but $user2 is the left ~= operand; $user still carries the smaller uid.
+for $user in dataset('Users')
+for $user2 in dataset('Users')
+where $user2.lottery_numbers ~= $user.lottery_numbers and $user.uid < $user2.uid
+order by $user.uid, $user2.uid
+return { 'user': $user, 'user2': $user2 }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-3_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-3_1.aql
new file mode 100644
index 0000000..abbb66a
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-3_1.aql
@@ -0,0 +1,103 @@
+drop dataverse fuzzyjoin if exists;
+// Three-stage fuzzy join of Users and Visitors on interests, written out by hand (token ranking, id pairs, record lookup).
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: {{string}}
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: {{string}}
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+// Both loads go through NCFileSystemAdapter with format "adm".
+load dataset Users
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/users-visitors-small/users.json"),("format"="adm"));
+
+load dataset Visitors
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/users-visitors-small/visitors.json"),("format"="adm"));
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-int-3_1.adm';
+
+//
+// -- - Stage 3 - --
+// Look up the full user and visitor records for every (uid, vid) pair produced by Stage 2.
+for $ridpair in
+ //
+ // -- - Stage 2 - --
+ // Emit {uid, vid, sim} for user/visitor pairs that share a prefix token and have sim >= .5f.
+ for $user in dataset('Users')
+ let $lenUser := len($user.interests)
+ let $tokensUser :=
+ for $token in $user.interests
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - --
+ // One entry per distinct interest over Users, ordered by user count and then by the token; rebinds $user and $token.
+ for $user in dataset('Users')
+ for $token in $user.interests
+ group by $tokenGroupped := $token with $user
+ order by count($user), $tokenGroupped
+ return $tokenGroupped
+ where $token = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenUser in subset-collection(
+ $tokensUser,
+ 0,
+ prefix-len-jaccard($lenUser, .5f))
+
+ for $visitor in dataset('Visitors')
+ let $lenVisitor := len($visitor.interests)
+ let $tokensVisitor :=
+ for $token in $visitor.interests
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - --
+ // The visitor side reuses the ranking built from Users, so only interests that equal a ranked token yield a position.
+ for $user in dataset('Users')
+ for $token in $user.interests
+ group by $tokenGroupped := $token with $user
+ order by count($user), $tokenGroupped
+ return $tokenGroupped
+ where $token = $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenVisitor in subset-collection(
+ $tokensVisitor,
+ 0,
+ prefix-len-jaccard($lenVisitor, .5f))
+ // Only pairs that have a prefix token in common go on to the similarity check.
+ where $prefixTokenUser = $prefixTokenVisitor
+ // A pair can match on several prefix tokens; the group by below collapses those rows and $sim[0] keeps the first value.
+ let $sim := similarity-jaccard-prefix(
+ $lenUser,
+ $tokensUser,
+ $lenVisitor,
+ $tokensVisitor,
+ $prefixTokenUser,
+ .5f)
+ where $sim >= .5f
+ group by $uid := $user.uid, $vid := $visitor.vid with $sim
+ return {'uid': $uid, 'vid': $vid, 'sim': $sim[0]}
+
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $ridpair.uid = $user.uid and $ridpair.vid = $visitor.vid
+order by $user.uid, $visitor.vid
+return {'user': $user, 'visitor': $visitor, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_1.aql
new file mode 100644
index 0000000..1a676c5
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_1.aql
@@ -0,0 +1,37 @@
+drop dataverse fuzzyjoin if exists;
+// Fuzzy-join test: pairs Users with Visitors whose interests match under the ~= operator.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-int-aqlplus_1.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// Users is bound by the first for clause and is the left ~= operand; output is ordered by (uid, vid).
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $user.interests ~= $visitor.interests
+order by $user.uid, $visitor.vid
+return { 'user': $user, 'visitor': $visitor }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_2.aql
new file mode 100644
index 0000000..1e6e417
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_2.aql
@@ -0,0 +1,37 @@
+drop dataverse fuzzyjoin if exists;
+// Fuzzy-join test: pairs Users with Visitors whose interests match under the ~= operator.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-int-aqlplus_2.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// Visitors is bound by the first for clause while Users stays the left ~= operand; output is ordered by (uid, vid).
+for $visitor in dataset('Visitors')
+for $user in dataset('Users')
+where $user.interests ~= $visitor.interests
+order by $user.uid, $visitor.vid
+return { 'user': $user, 'visitor': $visitor }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_3.aql
new file mode 100644
index 0000000..2d39bd4
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-aqlplus_3.aql
@@ -0,0 +1,37 @@
+drop dataverse fuzzyjoin if exists;
+// Fuzzy-join test: pairs Users with Visitors whose interests match under the ~= operator.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-int-aqlplus_3.adm';
+// NOTE(review): simthreshold is presumably the threshold the ~= operator applies below - confirm.
+set simthreshold '.5f';
+// Users is bound by the first for clause, but Visitors is the left ~= operand; output is ordered by (uid, vid).
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $visitor.interests ~= $user.interests
+order by $user.uid, $visitor.vid
+return { 'user': $user, 'visitor': $visitor }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-vis-user-lot-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-vis-user-lot-aqlplus_1.aql
new file mode 100644
index 0000000..4561a7a
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-int-vis-user-lot-aqlplus_1.aql
@@ -0,0 +1,38 @@
+drop dataverse fuzzyjoin if exists;
+// Three-way fuzzy-join test: Users ~= Visitors on interests, then Visitors ~= a second Users binding on lottery_numbers.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-int-vis-user-lot-aqlplus_1.adm';
+// NOTE(review): simthreshold is presumably the threshold both ~= predicates apply below - confirm.
+set simthreshold '.5f';
+// $user and $user2 both range over Users; output is ordered by ($user.uid, $visitor.vid, $user2.uid).
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+for $user2 in dataset('Users')
+where $user.interests ~= $visitor.interests and $visitor.lottery_numbers ~= $user2.lottery_numbers
+order by $user.uid, $visitor.vid, $user2.uid
+return {'user': $user, 'visitor': $visitor, 'user2': $user2}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-3_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-3_1.aql
new file mode 100644
index 0000000..4de46a5
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-3_1.aql
@@ -0,0 +1,102 @@
+drop dataverse fuzzyjoin if exists;
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+
+create type UserType as open {
+  uid: int32,
+  name: string,
+  lottery_numbers: [int32],
+  interests: {{string}}
+}
+
+create type VisitorType as open {
+  vid: int32,
+  name: string,
+  lottery_numbers: [int32],
+  interests: {{string}}
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/users-visitors-small/users.json"),("format"="adm"));
+
+load dataset Visitors
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/users-visitors-small/visitors.json"),("format"="adm"));
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-lot-3_1.adm';
+
+// Stage 3: join each (uid, vid, sim) pair produced by Stage 2 back to the
+// Users and Visitors datasets and return the full records together with
+// their similarity, ordered by uid and then vid.
+for $ridpair in
+    // Stage 2: pair users and visitors that share a token in the prefix of
+    // their rank-ordered lottery_numbers, then keep only the pairs whose
+    // similarity-jaccard-prefix value is at least .5f.
+    for $user in dataset('Users')
+    let $lenUser := len($user.lottery_numbers)
+    let $tokensUser :=
+        for $token in $user.lottery_numbers
+        for $tokenRanked at $i in
+            // Stage 1: order the Users tokens by how many users carry them.
+            // The token itself is the tie-breaker, so this subquery and its
+            // copy on the Visitors side always assign identical ranks.
+            for $user in dataset('Users')
+            for $token in $user.lottery_numbers
+            group by $tokenGroupped := $token with $user
+            order by count($user), $tokenGroupped
+            return $tokenGroupped
+        where $token = $tokenRanked
+        order by $i
+        return $i
+    for $prefixTokenUser in subset-collection(
+                                $tokensUser,
+                                0,
+                                prefix-len-jaccard($lenUser, .5f))
+
+    for $visitor in dataset('Visitors')
+    let $lenVisitor := len($visitor.lottery_numbers)
+    let $tokensVisitor :=
+        for $token in $visitor.lottery_numbers
+        for $tokenRanked at $i in
+            // Stage 1 (same subquery as on the Users side): visitor tokens
+            // are ranked against the Users token order, so the ranks of
+            // both sides are comparable. Keep the two copies identical.
+            for $user in dataset('Users')
+            for $token in $user.lottery_numbers
+            group by $tokenGroupped := $token with $user
+            order by count($user), $tokenGroupped
+            return $tokenGroupped
+        where $token = $tokenRanked
+        order by $i
+        return $i
+    for $prefixTokenVisitor in subset-collection(
+                                   $tokensVisitor,
+                                   0,
+                                   prefix-len-jaccard($lenVisitor, .5f))
+    // Candidate pairs: user and visitor have a common token in their prefixes.
+    where $prefixTokenUser = $prefixTokenVisitor
+    // Verify each candidate; group by (uid, vid) since a pair can match on several prefix tokens.
+    let $sim := similarity-jaccard-prefix(
+                    $lenUser,
+                    $tokensUser,
+                    $lenVisitor,
+                    $tokensVisitor,
+                    $prefixTokenUser,
+                    .5f)
+    where $sim >= .5f
+    group by $uid := $user.uid, $vid := $visitor.vid with $sim
+    return {'uid': $uid, 'vid': $vid, 'sim': $sim[0]}
+
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $ridpair.uid = $user.uid and $ridpair.vid = $visitor.vid
+order by $user.uid, $visitor.vid
+return {'user': $user, 'visitor': $visitor, 'sim': $ridpair.sim}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_1.aql
new file mode 100644
index 0000000..d328366
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_1.aql
@@ -0,0 +1,37 @@
+drop dataverse fuzzyjoin if exists; // start from a clean dataverse on every run
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Users and Visitors have the same fields apart from their key (uid vs. vid).
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-lot-aqlplus_1.adm'; // result of the query below
+
+set simthreshold '.5f'; // threshold used by the ~= similarity predicate below
+// Fuzzy join of Users (outer loop) and Visitors on lottery_numbers, with the user on the left of ~=.
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $user.lottery_numbers ~= $visitor.lottery_numbers
+order by $user.uid, $visitor.vid
+return {'user': $user, 'visitor': $visitor}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_2.aql
new file mode 100644
index 0000000..bf0f287
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_2.aql
@@ -0,0 +1,37 @@
+drop dataverse fuzzyjoin if exists; // start from a clean dataverse on every run
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Users and Visitors have the same fields apart from their key (uid vs. vid).
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-lot-aqlplus_2.adm'; // result of the query below
+
+set simthreshold '.5f'; // threshold used by the ~= similarity predicate below
+// Fuzzy join on lottery_numbers with Visitors as the outer loop; the user stays on the left of ~=.
+for $visitor in dataset('Visitors')
+for $user in dataset('Users')
+where $user.lottery_numbers ~= $visitor.lottery_numbers
+order by $user.uid, $visitor.vid
+return {'user': $user, 'visitor': $visitor}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_3.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_3.aql
new file mode 100644
index 0000000..72cb80c
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_3.aql
@@ -0,0 +1,37 @@
+drop dataverse fuzzyjoin if exists; // start from a clean dataverse on every run
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Users and Visitors have the same fields apart from their key (uid vs. vid).
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-lot-aqlplus_3.adm'; // result of the query below
+
+set simthreshold '.5f'; // threshold used by the ~= similarity predicate below
+// Fuzzy join on lottery_numbers with Users as the outer loop and the visitor on the left of ~=.
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $visitor.lottery_numbers ~= $user.lottery_numbers
+order by $user.uid, $visitor.vid
+return {'user': $user, 'visitor': $visitor}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_4.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_4.aql
new file mode 100644
index 0000000..19415d5
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_4.aql
@@ -0,0 +1,38 @@
+drop dataverse fuzzyjoin if exists; // start from a clean dataverse on every run
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Users and Visitors have the same fields apart from their key (uid vs. vid).
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-lot-aqlplus_4.adm'; // result of the query below
+
+set simfunction 'Jaccard'; // similarity function used by the ~= predicate below
+set simthreshold '.6f'; // threshold used by the ~= predicate below
+// Fuzzy join of Users and Visitors on lottery_numbers with both similarity settings given explicitly.
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $user.lottery_numbers ~= $visitor.lottery_numbers
+order by $user.uid, $visitor.vid
+return {'user': $user, 'visitor': $visitor}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_5.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_5.aql
new file mode 100644
index 0000000..698b1b5
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-aqlplus_5.aql
@@ -0,0 +1,35 @@
+drop dataverse fuzzyjoin if exists; // start from a clean dataverse on every run
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Users and Visitors have the same fields apart from their key (uid vs. vid).
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-lot-aqlplus_5.adm'; // result of the query below
+// No simfunction/simthreshold is set, so ~= runs with the engine defaults (presumably Jaccard, 0.5 - TODO confirm).
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $user.lottery_numbers ~= $visitor.lottery_numbers
+order by $user.uid, $visitor.vid
+return {'user': $user, 'visitor': $visitor}
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-int-aqlplus_1.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-int-aqlplus_1.aql
new file mode 100644
index 0000000..6a43fa6
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-int-aqlplus_1.aql
@@ -0,0 +1,39 @@
+drop dataverse fuzzyjoin if exists; // start from a clean dataverse on every run
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Users and Visitors have the same fields apart from their key (uid vs. vid).
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-lot-int-aqlplus_1.adm'; // result of the query below
+
+set simthreshold '.5f'; // threshold used by both ~= similarity predicates below
+// Fuzzy join with two conjunctive ~= predicates (lottery_numbers and interests); Users is the outer loop.
+for $user in dataset('Users')
+for $visitor in dataset('Visitors')
+where $user.lottery_numbers ~= $visitor.lottery_numbers
+and $user.interests ~= $visitor.interests
+order by $user.uid, $visitor.vid
+return { 'user': $user, 'visitor': $visitor }
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-int-aqlplus_2.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-int-aqlplus_2.aql
new file mode 100644
index 0000000..693191a
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/user-vis-lot-int-aqlplus_2.aql
@@ -0,0 +1,39 @@
+drop dataverse fuzzyjoin if exists; // start from a clean dataverse on every run
+
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Users and Visitors have the same fields apart from their key (uid vs. vid).
+create type UserType as open {
+ uid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create type VisitorType as open {
+ vid: int32,
+ name: string,
+ lottery_numbers: [int32],
+ interests: <string>
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset Users(UserType) partitioned by key uid on group1;
+create dataset Visitors(VisitorType) partitioned by key vid on group1;
+
+
+load dataset Users from nc1:'data/users-visitors-small/users.json';
+load dataset Visitors from nc1:'data/users-visitors-small/visitors.json';
+
+write output to nc1:'rttest/fuzzyjoin_user-vis-lot-int-aqlplus_2.adm'; // result of the query below
+
+set simthreshold '.5f'; // threshold used by both ~= similarity predicates below
+// Fuzzy join with two conjunctive ~= predicates (lottery_numbers and interests); Visitors is the outer loop.
+for $visitor in dataset('Visitors')
+for $user in dataset('Users')
+where $user.lottery_numbers ~= $visitor.lottery_numbers
+and $user.interests ~= $visitor.interests
+order by $user.uid, $visitor.vid
+return { 'user': $user, 'visitor': $visitor }