blob: 0ea267c7da5f362270362b8ba60d4b5caff57d71 [file] [log] [blame]
/*
 * Description    : Fuzzy joins two datasets, Customers and Customers2, based on the edit-distance function of their names.
 *                  Customers has a 3-gram index on name, and we expect the join to be transformed into an indexed nested-loop join.
 *                  We test the inlining of variables that enable the select to be pushed into the join for subsequent optimization with an index.
 *                  We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary.
 * Success        : Yes
 */
8
/* Recreate the test dataverse from scratch so that nothing from an earlier run is reused. */
drop dataverse test if exists;
create dataverse test;
use dataverse test;
12
/* Address record; referenced by the optional address field of CustomerType. */
create type AddressType as open {
  number: int32,
  street: string,
  city: string
}
18
/*
 * Record type shared by both join inputs.
 * cid is the partitioning key of the datasets built on this type, and name is the
 * field the fuzzy join compares. Fields whose type ends in '?' are optional.
 */
create type CustomerType as open {
  cid: int32,
  name: string,
  age: int32?,
  address: AddressType?,
  interests: [string],
  children: [ { name: string, age: int32? } ]
}
27
/*
 * The two join inputs. Both use CustomerType and are partitioned on cid;
 * only Customers receives the n-gram index on name later in this script.
 */
create dataset Customers(CustomerType) partitioned by key cid;

create dataset Customers2(CustomerType) partitioned by key cid;
31
/*
 * Bulk-load both datasets from the same ADM file on node nc1, so Customers and
 * Customers2 start out with identical contents.
 */
load dataset Customers
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));

load dataset Customers2
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));
39
/* 3-gram index on Customers.name; the edit-distance join below is expected to be rewritten to probe it. */
create index ngram_index on Customers(name) type ngram(3);
41
/* Send the query result to a file on node nc1. */
write output to nc1:"rttest/inverted-index-join-noeqjoin_ngram-edit-distance-inline.adm";
43
/*
 * Fuzzy self-style join: pair every Customers record with every Customers2 record
 * whose name is within edit distance 4.
 *
 * The edit distance is bound to $ed with a let clause instead of being written
 * directly in the where clause; the test relies on the optimizer inlining $ed so
 * that the selection can be pushed into the join.
 *
 * Because both datasets are loaded from the same file, the $a.cid < $b.cid
 * predicate drops pairs of a record with itself and keeps only one ordering of
 * each remaining pair.
 *
 * The order by clause makes the output deterministic for comparison against the
 * expected result file.
 */
for $a in dataset('Customers')
for $b in dataset('Customers2')
let $ed := edit-distance($a.name, $b.name)
where $ed <= 4 and $a.cid < $b.cid
order by $ed, $a.cid, $b.cid
return { "a": $a.name, "b": $b.name, "ed": $ed }