Added asterix project git-svn-id: https://asterixdb.googlecode.com/svn/trunk/asterix@12 eaa15691-b419-025a-1212-ee371bd00084

commit: 38b7ca49742d23aa076e4656226d3fa1f4e52ed7 [log] [tgz]
author: vinayakb <vinayakb@eaa15691-b419-025a-1212-ee371bd00084> Mon Mar 05 05:44:15 2012 +0000
committer: vinayakb <vinayakb@eaa15691-b419-025a-1212-ee371bd00084> Mon Mar 05 05:44:15 2012 +0000
tree: e8b3a8565af025577b8d083cc245b8c817fe3745
diff --git a/asterix-app/src/test/resources/fuzzyjoin/dblp/000-1-char-at.aql b/asterix-app/src/test/resources/fuzzyjoin/dblp/000-1-char-at.aql
new file mode 100644
index 0000000..43a595d
--- /dev/null
+++ b/asterix-app/src/test/resources/fuzzyjoin/dblp/000-1-char-at.aql

@@ -0,0 +1,43 @@
+use dataverse fuzzy1;
+// Open record type for DBLP entries: an int32 id plus four string fields.
+declare type DBLPType as open {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+// Node group group1 is declared on nc1 and nc2.
+declare nodegroup group1 on nc1, nc2;
+// Dataset DBLP holds DBLPType records, partitioned by key id on group1.
+declare dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+// The query result is written to /tmp/dblp.adm on nc1.
+write output to nc1:"/tmp/dblp.adm";
+// Pairs the paper with id 1 (R) with every DBLP paper (S) whose authors yield an equal word-tokens() token.
+for $paperR in dataset('DBLP')
+where $paperR.id = 1
+for $authorR in word-tokens($paperR.authors)
+for $paperS in dataset('DBLP')
+for $authorS in word-tokens($paperS.authors)
+where $authorR = $authorS
+return {'R': {'dblpid': $paperR.dblpid, 'authors': $paperR.authors}, 
+        'S': {'dblpid': $paperS.dblpid, 'authors': $paperS.authors}}
+// Stack trace kept with this query (IllegalArgumentException in StringUtils.charAt); presumably what running it produced -- confirm.
+/*
+java.lang.RuntimeException: java.lang.IllegalArgumentException
+        at edu.uci.ics.hyracks.control.nc.runtime.OperatorRunnable.run(OperatorRunnable.java:70)
+        at edu.uci.ics.hyracks.control.nc.Stagelet$1.run(Stagelet.java:120)
+        at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
+        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
+        at java.lang.Thread.run(Thread.java:619)
+Caused by: java.lang.IllegalArgumentException
+        at edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils.charAt(StringUtils.java:62)
+        at edu.uci.ics.asterix.jobgen.data.tagged.AqlSchemalessPrinterFactory$1.print(AqlSchemalessPrinterFactory.java:103)
+        at edu.uci.ics.asterix.jobgen.data.tagged.AqlSchemalessPrinterFactory$1.print(AqlSchemalessPrinterFactory.java:112)
+        at edu.uci.ics.aqua.runtime.operators.std.PrinterRuntime.printTuple(PrinterRuntime.java:90)
+        at edu.uci.ics.aqua.runtime.operators.std.PrinterRuntime.nextFrame(PrinterRuntime.java:58)
+        at edu.uci.ics.aqua.runtime.operators.base.AquaMetaOperatorDescriptor$2.nextFrame(AquaMetaOperatorDescriptor.java:123)
+        at edu.uci.ics.hyracks.control.nc.runtime.OperatorRunnable.run(OperatorRunnable.java:62)
+        ... 4 more
+*/

diff --git a/asterix-app/src/test/resources/fuzzyjoin/dblp/010-load.aql b/asterix-app/src/test/resources/fuzzyjoin/dblp/010-load.aql
new file mode 100644
index 0000000..a857935
--- /dev/null
+++ b/asterix-app/src/test/resources/fuzzyjoin/dblp/010-load.aql

@@ -0,0 +1,19 @@
+use dataverse fuzzy1;
+// Open record type for DBLP entries: an int32 id plus four string fields.
+declare type DBLPType as open {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+// Node group group1 is declared on nc1 and nc2.
+declare nodegroup group1 on nc1, nc2;
+// Dataset DBLP holds DBLPType records, partitioned by key id on group1.
+declare dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+// Loads DBLP through NCFileSystemAdapter from a ':'-delimited text file on nc1; each property is a ("key"="value") pair.
+load dataset DBLP 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1:///asterix/asterix-app/data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+

diff --git a/asterix-app/src/test/resources/fuzzyjoin/dblp/020-drop.aql b/asterix-app/src/test/resources/fuzzyjoin/dblp/020-drop.aql
new file mode 100644
index 0000000..65213dd
--- /dev/null
+++ b/asterix-app/src/test/resources/fuzzyjoin/dblp/020-drop.aql

@@ -0,0 +1,16 @@
+use dataverse fuzzy1;
+// Open record type for DBLP entries: an int32 id plus four string fields.
+declare type DBLPType as open {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+// Node group group1 is declared on nc1 and nc2.
+declare nodegroup group1 on nc1, nc2;
+// Dataset DBLP holds DBLPType records, partitioned by key id on group1.
+declare dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+// Drops the DBLP dataset declared above.
+drop dataset DBLP;

diff --git a/asterix-app/src/test/resources/fuzzyjoin/dblp/030-filter.aql b/asterix-app/src/test/resources/fuzzyjoin/dblp/030-filter.aql
new file mode 100644
index 0000000..b539aba
--- /dev/null
+++ b/asterix-app/src/test/resources/fuzzyjoin/dblp/030-filter.aql

@@ -0,0 +1,20 @@
+use dataverse fuzzy1;
+// Open record type for DBLP entries: an int32 id plus four string fields.
+declare type DBLPType as open {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+// Node group group1 is declared on nc1 and nc2.
+declare nodegroup group1 on nc1, nc2;
+// Dataset DBLP holds DBLPType records, partitioned by key id on group1.
+declare dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+// The query result is written to /tmp/dblp.adm on nc1.
+write output to nc1:'/tmp/dblp.adm';
+// Returns the DBLP record whose id equals 1, wrapped in a record under the 'dblp' field.
+for $paper in dataset('DBLP')
+where $paper.id = 1
+return { 'dblp': $paper }

diff --git a/asterix-app/src/test/resources/fuzzyjoin/dblp/040-self-join-aql.aql b/asterix-app/src/test/resources/fuzzyjoin/dblp/040-self-join-aql.aql
new file mode 100644
index 0000000..63e1a1e
--- /dev/null
+++ b/asterix-app/src/test/resources/fuzzyjoin/dblp/040-self-join-aql.aql

@@ -0,0 +1,86 @@
+use dataverse fuzzy1;
+// Open record type for DBLP entries: an int32 id plus four string fields.
+declare type DBLPType as open {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+// Node group group1 is declared on nc1 and nc2.
+declare nodegroup group1 on nc1, nc2;
+// Dataset DBLP holds DBLPType records, partitioned by key id on group1.
+declare dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+// The query result is written to /tmp/dblp.adm on nc1.
+write output to nc1:'/tmp/dblp.adm';
+// Self-join of DBLP on title, written out as three nested stages.
+//
+// -- - Stage 3 - --
+// For each rid pair from Stage 2, look up both papers in DBLP and return them together with sim.
+for $ridpair in 
+    // $tokensR / $tokensS hold the Stage 1 positions ($i) of each paper's title tokens, ordered by $i.
+    // -- - Stage 2 - --
+    // Pairs papers R and S that share a prefix token; keeps pairs with sim >= .5 and R.id < S.id.
+    for $paperR in dataset('DBLP')
+    let $tokensR :=
+        for $word in counthashed-word-tokens($paperR.title)
+        for $token at $i in
+            //
+            // -- - Stage 1 - --
+            // Title tokens of all DBLP papers, grouped per token and ordered by count($paper).
+            for $paper in dataset('DBLP')
+            for $word in counthashed-word-tokens($paper.title)
+            group by $item := $word with $paper
+            order by count($paper)
+            return $item
+        where $word = $token
+        order by $i
+        return $i
+    for $prefix_tokenR in subset-collection(
+                                $tokensR, 
+                                0,
+                                prefix-len(
+                                    len($tokensR), 'Jaccard', .5))
+    // Same token ranking and prefix extraction, repeated for the S side.
+    for $paperS in dataset('DBLP')
+    let $tokensS :=
+        for $word in counthashed-word-tokens($paperS.title)
+        for $token at $i in
+            //
+            // -- - Stage 1 - --
+            // Same subquery as on the R side: all title tokens ordered by count($paper).
+            for $paper in dataset('DBLP')
+            for $word in counthashed-word-tokens($paper.title)
+            group by $item := $word with $paper
+            order by count($paper)
+            return $item
+        where $word = $token
+        order by $i
+        return $i
+    for $prefix_tokenS in subset-collection(
+                                $tokensS, 
+                                0,
+                                prefix-len(
+                                    len($tokensS), 'Jaccard', .5))
+    // Candidate pairs are those whose prefix tokens are equal.
+    where $prefix_tokenR = $prefix_tokenS
+    // similarity() gets both token counts, both ranked token lists, the shared prefix token, 'Jaccard' and .5.
+    let $sim := similarity(
+                    len(counthashed-word-tokens($paperR.title)),
+                    $tokensR,
+                    len(counthashed-word-tokens($paperS.title)),
+                    $tokensS,
+                    $prefix_tokenR,
+                    'Jaccard',
+                    .5)
+    where $sim >= .5 and $paperR.id < $paperS.id
+    group by $idR := $paperR.id, $idS := $paperS.id with $sim
+    return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}
+
+for $paperR in dataset('DBLP')
+for $paperS in dataset('DBLP')
+where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
+return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
+         'S': { 'dblpid': $paperS.dblpid, 'title': $paperS.title },
+         'sim': $ridpair.sim }

diff --git a/asterix-app/src/test/resources/fuzzyjoin/dblp/050-self-join-op.aql b/asterix-app/src/test/resources/fuzzyjoin/dblp/050-self-join-op.aql
new file mode 100644
index 0000000..f2467f2
--- /dev/null
+++ b/asterix-app/src/test/resources/fuzzyjoin/dblp/050-self-join-op.aql

@@ -0,0 +1,24 @@
+use dataverse fuzzy1;
+// Open record type for DBLP entries: an int32 id plus four string fields.
+declare type DBLPType as open {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+// Node group group1 is declared on nc1 and nc2.
+declare nodegroup group1 on nc1, nc2;
+// Dataset DBLP holds DBLPType records, partitioned by key id on group1.
+declare dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+// The query result is written to /tmp/dblp.adm on nc1.
+write output to nc1:'/tmp/dblp.adm';
+// Sets simthreshold to '.5'; presumably the threshold applied by the ~= operator below -- confirm.
+set simthreshold '.5';
+// Self-join of DBLP: pairs whose titles match under ~=, keeping only pairs with R.id < S.id.
+for $paperR in dataset('DBLP')
+for $paperS in dataset('DBLP')
+where $paperR.title ~= $paperS.title and $paperR.id < $paperS.id
+return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
+         'S': { 'dblpid': $paperS.dblpid, 'title': $paperS.title }}
commit	38b7ca49742d23aa076e4656226d3fa1f4e52ed7	[log] [tgz]
author	vinayakb <vinayakb@eaa15691-b419-025a-1212-ee371bd00084>	Mon Mar 05 05:44:15 2012 +0000
committer	vinayakb <vinayakb@eaa15691-b419-025a-1212-ee371bd00084>	Mon Mar 05 05:44:15 2012 +0000
tree	e8b3a8565af025577b8d083cc245b8c817fe3745