/*
 * Description : Fuzzy-joins two datasets, DBLP and CSX, based on the edit-distance-contains function of their authors.
 *               DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join.
 * Success     : Yes
 */

/* Start from a clean slate: drop and recreate the `test` dataverse, then make it the active one. */
drop dataverse test if exists;
create dataverse test;
use dataverse test;

/*
 * Closed record type backing the DBLP dataset.
 * `id` serves as the dataset's primary key; `authors` is the string field
 * that the n-gram index and the fuzzy-join predicate in this file read.
 */
create type DBLPType as closed {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

/*
 * Closed record type backing the CSX dataset.
 * `id` serves as the dataset's primary key; `authors` is the string field
 * compared against DBLP authors by the fuzzy-join predicate in this file.
 */
create type CSXType as closed {
  id: int32,
  csxid: string,
  title: string,
  authors: string,
  misc: string
}

/* Both datasets are keyed on `id`. */
create dataset DBLP(DBLPType) primary key id;

create dataset CSX(CSXType) primary key id;

/* 3-gram index on DBLP.authors; only DBLP is indexed, CSX has no secondary index. */
create index ngram_index on DBLP(authors) type ngram(3);

/* Direct the query result to the given .adm path on nc1. */
write output to nc1:"rttest/inverted-index-join_ngram-edit-distance-contains.adm";

/*
 * Fuzzy join of DBLP ($a) with CSX ($b) on their `authors` strings.
 *   - edit-distance-contains(..., 3) is called with a threshold of 3; the
 *     [0] selects the first item of its result, which is what the `where`
 *     clause uses as the boolean match condition.
 *   - $a.id < $b.id additionally keeps only pairs whose DBLP id is smaller
 *     than the CSX id.
 * Each surviving pair is returned as one record holding both source records.
 */
for $a in dataset('DBLP')
for $b in dataset('CSX')
where edit-distance-contains($a.authors, $b.authors, 3)[0]
  and $a.id < $b.id
return { "arec": $a, "brec": $b }