/*
 * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' 3-gram tokens.
 *               DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join.
 * Success     : Yes
 */

/* Recreate the 'test' dataverse from scratch so earlier runs leave nothing behind. */
drop dataverse test if exists;
create dataverse test;
use dataverse test;

/* Closed record type for DBLP entries. */
create type DBLPType as closed {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

/* Closed record type for CSX entries; same fields as DBLPType except csxid replaces dblpid. */
create type CSXType as closed {
  id: int32,
  csxid: string,
  title: string,
  authors: string,
  misc: string
}

/* Both datasets are partitioned by their integer id field. */
create dataset DBLP(DBLPType) partitioned by key id;

create dataset CSX(CSXType) partitioned by key id;

/*
 * Load both datasets from ':'-delimited text files addressed through nc1.
 * Only the DBLP load is declared pre-sorted; the CSX load is not.
 */
load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;

load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));

/* 3-gram index on DBLP.title; the similarity join in this test is expected to use it. */
create index ngram_index on DBLP(title) type ngram(3);

/* Destination file for the query result. */
write output to nc1:"rttest/index-join_inverted-index-ngram-jaccard.adm";

/*
 * Join DBLP with CSX on title similarity: a pair qualifies when the Jaccard
 * similarity of the two titles' 3-gram token sets is at least 0.5 and the
 * DBLP id is smaller than the CSX id. Results are ordered by both ids so the
 * output has a stable order, and only the two titles are returned.
 */
for $a in dataset('DBLP')
for $b in dataset('CSX')
where similarity-jaccard(gram-tokens($a.title, 3, false), gram-tokens($b.title, 3, false)) >= 0.5f
      and $a.id < $b.id
order by $a.id, $b.id
return { "arec": $a.title, "brec": $b.title }