blob: bf6bb82da987bd0738d2164795015faa7a5c68d2 [file] [log] [blame]
/*
 * Description    : Fuzzy self-joins a dataset, DBLP, based on the similarity-jaccard function of its titles' 3-gram tokens.
 *                  DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join.
 *                  We test the inlining of variables that enable the select to be pushed into the join for subsequent optimization with an index.
 *                  We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary.
 * Success        : Yes
 */

// Start from a clean slate: recreate the `test` dataverse and make it the
// default scope for every statement that follows.
drop dataverse test if exists;
create dataverse test;
use dataverse test;

// Record type for DBLP entries. Declared `closed`, so records carry exactly
// the fields listed here. `id` serves as the dataset's primary key and
// `title` is the field covered by the n-gram index.
create type DBLPType as closed {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

// Dataset under test, keyed on `id`.
create dataset DBLP(DBLPType) primary key id;

// 3-gram index on `title`; the gram length (3) matches the gram-tokens()
// calls in the query so that the index is applicable to the join predicate.
create index ngram_index on DBLP(title) type ngram(3);

// Destination of the query output on node controller nc1.
write output to nc1:"rttest/inverted-index-join-noeqjoin_ngram-jaccard-inline.adm";

// Fuzzy self-join of DBLP on title similarity.
// - The Jaccard similarity is deliberately bound with `let` and then used in
//   both the `where` and the `return` clause: the test exercises inlining of
//   $jacc so that the selection can be pushed into the join.
// - `$a.id < $b.id` is a strict comparison on the primary key, so a record is
//   never paired with itself and each unordered pair is produced only once.
// - NOTE(review): the third gram-tokens() argument (false) presumably disables
//   pre/post padding of the grams; confirm against the function reference.
for $a in dataset('DBLP')
for $b in dataset('DBLP')
let $jacc := similarity-jaccard(gram-tokens($a.title, 3, false), gram-tokens($b.title, 3, false))
where $jacc >= 0.5f and $a.id < $b.id
return {"atitle": $a.title, "btitle": $b.title, "jacc": $jacc}