blob: 1c88536829b225acb5bd039055bdde014b1776ec [file] [log] [blame]
/*
 * Description    : Fuzzy joins two datasets, Customers and Customers2, based on the edit-distance function of their names.
 *                  Customers has a 3-gram index on name, and we expect the join to be transformed into an indexed nested-loop join.
 * Success        : Yes
 */

/* Start from a clean slate: recreate the "test" dataverse and make it current. */
drop dataverse test if exists;
create dataverse test;
use dataverse test;

/* Postal address record; referenced by CustomerType.address. */
create type AddressType as open {
  number: int32,
  street: string,
  city: string
}

/*
 * Record type shared by both joined datasets.
 * Fields marked with '?' (age, address, children.age) are optional.
 * The query below uses only cid and name.
 */
create type CustomerType as open {
  cid: int32,
  name: string,
  age: int32?,
  address: AddressType?,
  interests: [string],
  children: [ { name: string, age: int32? } ]
}

/* The two sides of the join; both use CustomerType and are keyed on cid. */
create dataset Customers(CustomerType) partitioned by key cid;

create dataset Customers2(CustomerType) partitioned by key cid;

/*
 * Both datasets are loaded from the same ADM file, so Customers2 holds the
 * same records as Customers and the query below is effectively a self-join.
 */
load dataset Customers
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));

load dataset Customers2
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm"));

/*
 * 3-gram index on Customers.name. Only Customers is indexed; Customers2 has
 * no secondary index, so this is the index the join rewrite is expected to use.
 */
create index ngram_index on Customers(name) type ngram(3);

/* Send the query result to a file (path carries the "nc1:" prefix). */
write output to nc1:"rttest/inverted-index-join_ngram-edit-distance.adm";

/*
 * Fuzzy join: pair every Customers record with every Customers2 record whose
 * name is within edit distance 4.
 * The "$a.cid < $b.cid" predicate keeps only pairs with a strictly smaller cid
 * on the Customers side. Since both datasets are loaded from the same file,
 * this presumably serves to drop self-matches and mirrored (b, a) duplicates.
 * Results are ordered by ($a.cid, $b.cid) so the output order is stable
 * (assuming cid is unique within each dataset, as its role as key suggests).
 */
for $a in dataset('Customers')
for $b in dataset('Customers2')
where edit-distance($a.name, $b.name) <= 4 and $a.cid < $b.cid
order by $a.cid, $b.cid
return { "arec": $a, "brec": $b }