blob: 0e15708c393c39c825ac7c2c94d82a9e87d6057c [file] [log] [blame]
/*
 * Description    : Fuzzy joins two datasets, DBLP and CSX, based on ~= using edit distance of their authors.
 *                  CSX has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join.
 *                  We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary.
 * Success        : Yes
 */
7
// Start from a clean slate: remove any previous `test` dataverse, recreate it,
// and make it the default dataverse for every statement that follows.
drop dataverse test if exists;
create dataverse test;
use dataverse test;
11
// Record type for the DBLP dataset. It is declared `closed`, so records are
// limited to the fields listed here. `authors` is the field compared by the
// fuzzy join; `id` is used as the dataset's primary key.
create type DBLPType as closed {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}
19
// Record type for the CSX dataset. Same shape as DBLPType except that the
// source-specific identifier field is `csxid`. `authors` is the field that
// carries the n-gram index and is compared by the fuzzy join.
create type CSXType as closed {
  id: int32,
  csxid: string,
  title: string,
  authors: string,
  misc: string
}
27
// Both datasets are keyed on `id`.
create dataset DBLP(DBLPType) primary key id;

create dataset CSX(CSXType) primary key id;

// 3-gram index on CSX.authors. CSX is the only dataset with a secondary index,
// so it is the side that an index-based fuzzy join can probe.
create index ngram_index on CSX(authors) type ngram(3);
33
// Destination of the query output on node controller nc1.
write output to nc1:"rttest/inverted-index-join-noeqjoin_ngram-fuzzyeq-edit-distance.adm";

// Configure the `~=` operator used by the query below: compare with edit
// distance, using a threshold of 3.
set simfunction 'edit-distance';
set simthreshold '3';
38
// Fuzzy self-style join: pair every DBLP record with every CSX record whose
// `authors` string is similar under `~=` (semantics set by the simfunction /
// simthreshold settings in effect), keeping only pairs where the DBLP id is
// strictly smaller than the CSX id. Only the two author strings are returned.
for $a in dataset('DBLP')
for $b in dataset('CSX')
where $a.authors ~= $b.authors
  and $a.id < $b.id
return {
  "aauthors": $a.authors,
  "bauthors": $b.authors
}