/*
 * Description : Fuzzy joins two datasets, Customers and Customers2, based on the edit distance between their names.
 *               Customers has a 3-gram index on name, and we expect the join to be transformed into an indexed nested-loop join.
 *               We expect the top-level equi join introduced by the surrogate optimization to be removed, since it is not necessary.
 * Success     : Yes
 */

/* Recreate the 'test' dataverse from scratch and make it the active one. */
drop dataverse test if exists;
create dataverse test;
use dataverse test;

/* Open record type for the nested address of a customer. */
create type AddressType as open {
  number: int32,
  street: string,
  city: string
}

/*
 * Open record type shared by both customer datasets.
 * Fields marked with '?' (age, address, and the age of each child) are optional.
 */
create type CustomerType as open {
  cid: int32,
  name: string,
  age: int32?,
  address: AddressType?,
  interests: [string],
  children: [ { name: string, age: int32? } ]
}

/* Both join inputs use CustomerType and are keyed on cid. */
create dataset Customers(CustomerType) primary key cid;

create dataset Customers2(CustomerType) primary key cid;

/* Both datasets are bulk-loaded from the same ADM file on node nc1. */
load dataset Customers
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));

load dataset Customers2
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));

/* 3-gram inverted index on Customers.name; this statement indexes Customers only. */
create index ngram_index on Customers(name) type ngram(3);

/* Send the result of the following query to a file on node nc1. */
write output to nc1:"rttest/inverted-index-join-noeqjoin_ngram-edit-distance.adm";

/*
 * Pairs every Customers record with every Customers2 record whose name is
 * within edit distance 4, keeping only pairs where $a.cid < $b.cid.
 * Results are ordered by both cids and return the two names.
 */
for $a in dataset('Customers')
for $b in dataset('Customers2')
where edit-distance($a.name, $b.name) <= 4 and $a.cid < $b.cid
order by $a.cid, $b.cid
return { "a": $a.name, "b": $b.name }