Added asterix project
git-svn-id: https://asterixdb.googlecode.com/svn/trunk@12 eaa15691-b419-025a-1212-ee371bd00084
diff --git a/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx-small.aql b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx-small.aql
new file mode 100644
index 0000000..f7dcf0e
--- /dev/null
+++ b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx-small.aql
@@ -0,0 +1,20 @@
+use dataverse fuzzy1;
+// Bulk-loads the small CSX sample file into dataset CSXSmall. CSXType is an open record type with the five named fields below.
+declare type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// Node group group1 covers nodes nc1 and nc2.
+declare nodegroup group1 on nc1, nc2;
+// CSXSmall stores CSXType records, keyed on id and partitioned on group1.
+declare dataset CSXSmall(CSXType)
+ partitioned by key id on group1;
+// Input is ':'-delimited text read through nc1. NOTE(review): "pre-sorted" presumably asserts the file is already ordered by id -- confirm the input satisfies that.
+load dataset CSXSmall
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/fuzzyjoin/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+
diff --git a/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx.aql b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx.aql
new file mode 100644
index 0000000..b5787a1
--- /dev/null
+++ b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx.aql
@@ -0,0 +1,20 @@
+use dataverse fuzzy1;
+// Bulk-loads the full CSX file into dataset CSX. CSXType is an open record type with the five named fields below.
+declare type CSXType as open {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// Node group group1 covers nodes nc1 and nc2.
+declare nodegroup group1 on nc1, nc2;
+// CSX stores CSXType records, keyed on id and partitioned on group1.
+declare dataset CSX(CSXType)
+ partitioned by key id on group1;
+// Input is ':'-delimited text read through nc1. NOTE(review): "pre-sorted" presumably asserts the file is already ordered by id -- confirm the input satisfies that.
+load dataset CSX
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/fuzzyjoin/pub/csx-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+
diff --git a/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-dblp.aql b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-dblp.aql
new file mode 100644
index 0000000..3ca85bb
--- /dev/null
+++ b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-dblp.aql
@@ -0,0 +1,20 @@
+use dataverse fuzzy1;
+// Bulk-loads the DBLP file into dataset DBLP. DBLPType is an open record type with the five named fields below.
+declare type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// NOTE(review): group1 is declared on nc1 only here, while 20-drop-dblp.aql, 30-filter-dblp.aql and 50-self-join-dblp.aql declare it on nc1, nc2 -- confirm which is intended.
+declare nodegroup group1 on nc1;
+// DBLP stores DBLPType records, keyed on id and partitioned on group1.
+declare dataset DBLP(DBLPType)
+ partitioned by key id on group1;
+// Input is ':'-delimited text read through nc1. NOTE(review): "pre-sorted" presumably asserts the file is already ordered by id -- confirm the input satisfies that.
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/fuzzyjoin/dblp/dblp-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+
diff --git a/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/20-drop-dblp.aql b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/20-drop-dblp.aql
new file mode 100644
index 0000000..868d534
--- /dev/null
+++ b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/20-drop-dblp.aql
@@ -0,0 +1,17 @@
+use dataverse fuzzy1;
+// Drops dataset DBLP. The type, node group and dataset are re-declared first so the drop statement can reference DBLP.
+declare type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// NOTE(review): group1 is on nc1, nc2 here but on nc1 only in 10-load-dblp.aql -- confirm the declarations are meant to differ.
+declare nodegroup group1 on nc1, nc2;
+// DBLP stores DBLPType records, keyed on id and partitioned on group1.
+declare dataset DBLP(DBLPType)
+ partitioned by key id on group1;
+// Remove the dataset declared above.
+drop dataset DBLP;
+
diff --git a/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/30-filter-dblp.aql b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/30-filter-dblp.aql
new file mode 100644
index 0000000..45bd1c8
--- /dev/null
+++ b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/30-filter-dblp.aql
@@ -0,0 +1,20 @@
+use dataverse fuzzy1;
+// Point lookup on DBLP: returns the record(s) whose id equals 1. DBLPType is an open record type with the five named fields below.
+declare type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// NOTE(review): group1 is on nc1, nc2 here but on nc1 only in 10-load-dblp.aql -- confirm the declarations are meant to differ.
+declare nodegroup group1 on nc1, nc2;
+// DBLP stores DBLPType records, keyed on id and partitioned on group1.
+declare dataset DBLP(DBLPType)
+ partitioned by key id on group1;
+// The query result is written to /tmp/amerix.adm on nc1.
+write output to nc1:"/tmp/amerix.adm";
+// Equality filter on the partitioning key id; the whole matching record is returned.
+for $dblp in dataset('DBLP')
+where $dblp.id = 1
+return $dblp
diff --git a/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/40-self-join-dblp.aql b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/40-self-join-dblp.aql
new file mode 100644
index 0000000..bc024c3
--- /dev/null
+++ b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/40-self-join-dblp.aql
@@ -0,0 +1,86 @@
+use dataverse fuzzy1;
+// Self-join of DBLP on title similarity ('Jaccard', threshold .8), written as three nested stages (Stage 1 is innermost).
+declare type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// NOTE(review): group1 is on nc1 only here (as in 10-load-dblp.aql), while the 20-, 30- and 50- scripts use nc1, nc2 -- confirm which is intended.
+declare nodegroup group1 on nc1;
+// NOTE(review): counthashed-word-tokens, prefix-len, subset-collection and similarity are engine built-ins; their exact semantics are not visible in this file.
+declare dataset DBLP(DBLPType)
+ partitioned by key id on group1;
+// The query result is written to /tmp/amerix.adm on nc1.
+write output to nc1:"/tmp/amerix.adm";
+
+//
+// -- - Stage 3 - --
+// Joins each {idR, idS, sim} pair produced by Stage 2 back to DBLP to return dblpid and title of both records plus sim.
+for $ridpair in
+ //
+ // -- - Stage 2 - --
+ // Pairs records R and S that share a prefix token, then keeps pairs with $sim >= .8 and R.id < S.id.
+ for $paperR in dataset('DBLP')
+ let $tokensR :=
+ for $word in counthashed-word-tokens($paperR.title)
+ for $token at $i in
+ // $i is the position of $token in the Stage 1 result; $tokensR collects the positions of this title's words, in ascending order.
+ // -- - Stage 1 - --
+ // Groups every title token of DBLP by value and returns one item per distinct token, ordered by count($paper).
+ for $paper in dataset('DBLP')
+ for $word in counthashed-word-tokens($paper.title)
+ group by $item := $word with $paper
+ order by count($paper)
+ return $item
+ where $word = $token
+ order by $i
+ return $i
+ for $prefix_tokenR in subset-collection(
+ $tokensR,
+ 0,
+ prefix-len(
+ len($tokensR), 'Jaccard', .8))
+ // Same token-position and prefix computation as above, repeated for the S side of the join.
+ for $paperS in dataset('DBLP')
+ let $tokensS :=
+ for $word in counthashed-word-tokens($paperS.title)
+ for $token at $i in
+ // $i is the position of $token in the Stage 1 result; $tokensS collects the positions of this title's words, in ascending order.
+ // -- - Stage 1 - --
+ // Identical to the Stage 1 sub-query used for $tokensR.
+ for $paper in dataset('DBLP')
+ for $word in counthashed-word-tokens($paper.title)
+ group by $item := $word with $paper
+ order by count($paper)
+ return $item
+ where $word = $token
+ order by $i
+ return $i
+ for $prefix_tokenS in subset-collection(
+ $tokensS,
+ 0,
+ prefix-len(
+ len($tokensS), 'Jaccard', .8))
+ // Candidate pairs are those whose prefixes have a token in common.
+ where $prefix_tokenR = $prefix_tokenS
+ // Compute the pair's similarity; after the group-by below $sim is indexed per (idR, idS) and its first element is returned.
+ let $sim := similarity(
+ len(counthashed-word-tokens($paperR.title)),
+ $tokensR,
+ len(counthashed-word-tokens($paperS.title)),
+ $tokensS,
+ $prefix_tokenR,
+ 'Jaccard',
+ .8)
+ where $sim >= .8 and $paperR.id < $paperS.id
+ group by $idR := $paperR.id, $idS := $paperS.id with $sim
+ return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}
+// Stage 3 body: look up both records of each pair by id.
+for $paperR in dataset('DBLP')
+for $paperS in dataset('DBLP')
+where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
+return {'R': {'dblpid': $paperR.dblpid, 'title': $paperR.title},
+ 'S': {'dblpid': $paperS.dblpid, 'title': $paperS.title},
+ 'sim': $ridpair.sim}
diff --git a/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/50-self-join-dblp.aql b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/50-self-join-dblp.aql
new file mode 100644
index 0000000..eb44be7
--- /dev/null
+++ b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/50-self-join-dblp.aql
@@ -0,0 +1,22 @@
+use dataverse fuzzy1;
+// Self-join of DBLP on title using the ~= operator; returns both full records of every matching pair.
+declare type DBLPType as open {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// NOTE(review): group1 is on nc1, nc2 here but on nc1 only in 10-load-dblp.aql and 40-self-join-dblp.aql -- confirm the declarations are meant to differ.
+declare nodegroup group1 on nc1, nc2;
+// DBLP stores DBLPType records, keyed on id and partitioned on group1.
+declare dataset DBLP(DBLPType)
+ partitioned by key id on group1;
+// The query result is written to /tmp/amerix.adm on nc1.
+write output to nc1:"/tmp/amerix.adm";
+// NOTE(review): ~= is presumably the engine's fuzzy-match operator; its similarity function and threshold are not set in this file -- confirm the defaults.
+for $paperR in dataset('DBLP')
+for $paperS in dataset('DBLP')
+where $paperR.title ~= $paperS.title
+return { 'R': $paperR,
+ 'S': $paperS }
diff --git a/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/line.py b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/line.py
new file mode 100644
index 0000000..f845bb2
--- /dev/null
+++ b/asterix/asterix-app/src/test/resources/fuzzyjoin/amerix/line.py
@@ -0,0 +1,7 @@
+# Print the length of the longest line in the CSX input file.
+m = 0
+with open('/data/fuzzyjoin/pub/csx-id.txt') as f:  # 'with' closes the file even if reading fails
+    for line in f:
+        # len() includes the line terminator, since iterating a file does not strip it
+        m = max(m, len(line))
+print(m)  # parenthesized form is valid in both Python 2 and Python 3