| /* |
| * Description : Fuzzy joins two datasets, Customers and Customers2, based on the edit-distance function of their names. |
| * Customers has a 3-gram index on name, and we expect the join to be transformed into an indexed nested-loop join. |
| * We test the inlining of variables that enables the select to be pushed into the join for subsequent optimization with an index. |
| * We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary. |
| * Success : Yes |
| */ |
| /* Start from a clean slate: drop any existing `test` dataverse, recreate it, and make it the active dataverse. */ |
| drop dataverse test if exists; |
| create dataverse test; |
| use dataverse test; |
| /* Open record type with fields number (int32), street (string) and city (string); used for the optional `address` field of CustomerType. */ |
| create type AddressType as open { |
| number: int32, |
| street: string, |
| city: string |
| } |
| /* Open record type stored in both datasets. `cid` is the primary key, `name` is the field that is indexed and fuzzy-joined below; fields marked `?` (age, address, children.age) are optional. */ |
| create type CustomerType as open { |
| cid: int32, |
| name: string, |
| age: int32?, |
| address: AddressType?, |
| interests: [string], |
| children: [ { name: string, age: int32? } ] |
| } |
| /* The two join inputs: both datasets hold CustomerType records and use `cid` as primary key. */ |
| create dataset Customers(CustomerType) primary key cid; |
| |
| create dataset Customers2(CustomerType) primary key cid; |
| /* Both datasets are loaded from the same ADM file (customer.adm), so Customers2 starts out with the same records as Customers. */ |
| load dataset Customers |
| using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter" |
| (("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm")); |
| |
| load dataset Customers2 |
| using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter" |
| (("path"="nc1://data/semistructured/co1k_olist/customer.adm"),("format"="adm")); |
| /* 3-gram index on Customers.name, created after the load; only Customers is indexed, Customers2 is not. Per the header, this index is what the join is expected to use. */ |
| create index ngram_index on Customers(name) type ngram(3); |
| /* Direct the result of the query below to this output file on nc1. */ |
| write output to nc1:"rttest/inverted-index-join-noeqjoin_ngram-edit-distance-inline.adm"; |
| /* Fuzzy self-style join: keep pairs whose names are within edit distance 4. `$ed` is deliberately bound with `let` and reused in `where` so the test covers variable inlining (see header); do not inline it by hand. `$a.cid < $b.cid` excludes pairs with equal cid and keeps each remaining pair in one orientation only. Ordering by ($ed, $a.cid, $b.cid) fixes the output order. */ |
| for $a in dataset('Customers') |
| for $b in dataset('Customers2') |
| let $ed := edit-distance($a.name, $b.name) |
| where $ed <= 4 and $a.cid < $b.cid |
| order by $ed, $a.cid, $b.cid |
| return { "a": $a.name, "b": $b.name, "ed": $ed } |