Added runtime tests for fuzzy selection queries based on length-partitioned inverted indexes. git-svn-id: https://asterixdb.googlecode.com/svn/branches/asterix_lsm_length_filter@929 eaa15691-b419-025a-1212-ee371bd00084

commit: a9faa614d52d6ade54957078a5d9b33d4ab42b6b [log] [tgz]
author: alexander.behm <alexander.behm@eaa15691-b419-025a-1212-ee371bd00084> Sat Nov 24 22:38:05 2012 +0000
committer: alexander.behm <alexander.behm@eaa15691-b419-025a-1212-ee371bd00084> Sat Nov 24 22:38:05 2012 +0000
tree: 013bfa5c4146fb46685aabbc03f56faf0901e7b9
parent: b742e0964ae328fb70c441c67e85218903815c12 [diff]
diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-contains.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-contains.aql
new file mode 100644
index 0000000..54ec794
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-contains.aql

@@ -0,0 +1,29 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPType as closed {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+
+load dataset DBLP 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+create index ngram_index on DBLP(title) type fuzzy ngram(3);
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-ngram-contains.adm";
+
+for $o in dataset('DBLP')
+where contains($o.title, "Multimedia")
+order by $o.id
+return $o
\ No newline at end of file

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-edit-distance-panic.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-edit-distance-panic.aql
new file mode 100644
index 0000000..8bfc878
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-edit-distance-panic.aql

@@ -0,0 +1,29 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPType as closed {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+
+load dataset DBLP 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+create index ngram_index on DBLP(authors) type fuzzy ngram(3);
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-ngram-edit-distance-panic.adm";
+
+for $o in dataset('DBLP')
+let $ed := edit-distance-check($o.authors, "Amihay Motro", 5)
+where $ed[0]
+return $o
\ No newline at end of file

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-edit-distance.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-edit-distance.aql
new file mode 100644
index 0000000..6077389
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-edit-distance.aql

@@ -0,0 +1,29 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPType as closed {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+
+load dataset DBLP 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+create index ngram_index on DBLP(authors) type fuzzy ngram(3);
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-ngram-edit-distance.adm";
+
+for $o in dataset('DBLP')
+let $ed := edit-distance-check($o.authors, "Amihay Motro", 1)
+where $ed[0]
+return $o
\ No newline at end of file

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-jaccard.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-jaccard.aql
new file mode 100644
index 0000000..314c3bd
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ngram-jaccard.aql

@@ -0,0 +1,29 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPType as closed {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+
+load dataset DBLP 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+create index ngram_index on DBLP(title) type fuzzy ngram(3);
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-ngram-jaccard.adm";
+
+for $o in dataset('DBLP')
+let $jacc := similarity-jaccard-check(gram-tokens($o.title, 3, false), gram-tokens("Transactions for Cooperative Environments", 3, false), 0.5f)
+where $jacc[0]
+return $o
\ No newline at end of file

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-edit-distance-panic.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-edit-distance-panic.aql
new file mode 100644
index 0000000..3a3a303
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-edit-distance-panic.aql

@@ -0,0 +1,37 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type AddressType as closed {
+  number: int32, 
+  street: string,
+  city: string
+}
+
+create type CustomerType as closed {
+  cid: int32, 
+  name: string,
+  age: int32?,
+  address: AddressType?,
+  interests: [string],
+  children: [ { name: string, age: int32? } ]
+}
+
+create nodegroup group1 if not exists on nc1;
+
+create dataset Customers(CustomerType) 
+  partitioned by key cid on group1;
+
+load dataset Customers 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));
+
+create index interests_index on Customers(interests) type fuzzy keyword;
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-olist-edit-distance-panic.adm";
+
+for $c in dataset('Customers')
+let $ed := edit-distance-check($c.interests, ["computers", "wine", "walking"], 3)
+where $ed[0]
+order by $c.cid
+return $c

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-edit-distance.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-edit-distance.aql
new file mode 100644
index 0000000..b1ce7c4
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-edit-distance.aql

@@ -0,0 +1,37 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type AddressType as closed {
+  number: int32, 
+  street: string,
+  city: string
+}
+
+create type CustomerType as closed {
+  cid: int32, 
+  name: string,
+  age: int32?,
+  address: AddressType?,
+  interests: [string],
+  children: [ { name: string, age: int32? } ]
+}
+
+create nodegroup group1 if not exists on nc1;
+
+create dataset Customers(CustomerType) 
+  partitioned by key cid on group1;
+
+load dataset Customers 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));
+
+create index interests_index on Customers(interests) type fuzzy keyword;
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-olist-edit-distance.adm";
+
+for $c in dataset('Customers')
+let $ed := edit-distance-check($c.interests, ["computers", "wine", "walking"], 1)
+where $ed[0]
+order by $c.cid
+return $c

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-jaccard.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-jaccard.aql
new file mode 100644
index 0000000..b495ffe
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-olist-jaccard.aql

@@ -0,0 +1,36 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type AddressType as closed {
+  number: int32, 
+  street: string,
+  city: string
+}
+
+create type CustomerType as closed {
+  cid: int32, 
+  name: string,
+  age: int32?,
+  address: AddressType?,
+  interests: [string],
+  children: [ { name: string, age: int32? } ]
+}
+
+create nodegroup group1 if not exists on nc1;
+
+create dataset Customers(CustomerType) 
+  partitioned by key cid on group1;
+
+load dataset Customers 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));
+
+create index interests_index on Customers(interests) type fuzzy keyword;
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-olist-jaccard.adm";
+
+for $c in dataset('Customers')
+let $jacc := similarity-jaccard-check($c.interests, ["databases", "computers", "wine"], 0.7f)
+where $jacc[0]
+return $c

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ulist-jaccard.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ulist-jaccard.aql
new file mode 100644
index 0000000..4427369
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-ulist-jaccard.aql

@@ -0,0 +1,36 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type AddressType as closed {
+  number: int32, 
+  street: string,
+  city: string
+}
+
+create type CustomerType as closed {
+  cid: int32, 
+  name: string,
+  age: int32?,
+  address: AddressType?,
+  interests: {{string}},
+  children: [ { name: string, age: int32? } ]
+}
+
+create nodegroup group1 if not exists on nc1;
+
+create dataset Customers(CustomerType) 
+  partitioned by key cid on group1;
+
+load dataset Customers 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/semistructured/co1k/customer.adm"),("format"="adm"));
+
+create index interests_index on Customers(interests) type fuzzy keyword;
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-ulist-jaccard.adm";
+
+for $c in dataset('Customers')
+let $jacc := similarity-jaccard-check($c.interests, ["databases", "computers", "wine"], 0.7f)
+where $jacc[0]
+return $c

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-word-contains.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-word-contains.aql
new file mode 100644
index 0000000..888a381
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-word-contains.aql

@@ -0,0 +1,29 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPType as closed {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+
+load dataset DBLP 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+create index keyword_index on DBLP(title) type fuzzy keyword;
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-word-contains.adm";
+
+for $o in dataset('DBLP')
+where contains($o.title, "Multimedia")
+order by $o.id
+return $o

diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-word-jaccard.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-word-jaccard.aql
new file mode 100644
index 0000000..23c9289
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/fuzzy-inverted-index-word-jaccard.aql

@@ -0,0 +1,30 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPType as closed {
+  id: int32, 
+  dblpid: string,
+  title: string,
+  authors: string,
+  misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType) 
+  partitioned by key id on group1;
+
+load dataset DBLP 
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
+create index keyword_index on DBLP(title) type fuzzy keyword;
+
+write output to nc1:"rttest/index-selection_fuzzy-inverted-index-word-jaccard.adm";
+
+for $o in dataset('DBLP')
+let $jacc := similarity-jaccard-check(word-tokens($o.title), word-tokens("Transactions for Cooperative Environments"), 0.5f)
+where $jacc[0]
+return $o
+
commit	a9faa614d52d6ade54957078a5d9b33d4ab42b6b	[log] [tgz]
author	alexander.behm <alexander.behm@eaa15691-b419-025a-1212-ee371bd00084>	Sat Nov 24 22:38:05 2012 +0000
committer	alexander.behm <alexander.behm@eaa15691-b419-025a-1212-ee371bd00084>	Sat Nov 24 22:38:05 2012 +0000
tree	013bfa5c4146fb46685aabbc03f56faf0901e7b9
parent	b742e0964ae328fb70c441c67e85218903815c12 [diff]