// Setup for the fuzzy-join test: creates the `fuzzyjoin` dataverse together
// with the record types and datasets used by the statements that follow.

// Drop any existing dataverse of the same name before recreating it.
drop dataverse fuzzyjoin if exists;

create dataverse fuzzyjoin;

use dataverse fuzzyjoin;

// Record type for the DBLP dataset.
create type DBLPType as closed {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

// Record type pairing a token with its rank.
create type TOKENSRANKEDADMType as closed {
  token: int32,
  rank: int32
}

// DBLP is keyed on `id`; TOKENSRANKEDADM is keyed on `rank`.
create dataset DBLP(DBLPType) partitioned by key id;
create dataset TOKENSRANKEDADM(TOKENSRANKEDADMType) partitioned by key rank;

// Load DBLP from a ':'-delimited text file through the NC file system adapter.
// The input is declared `pre-sorted`.
load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;

// Load the token/rank pairs from an ADM-formatted file.
load dataset TOKENSRANKEDADM
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/tokensranked.adm"),("format"="adm"));

write output to nc1:'rttest/fuzzyjoin_dblp-2.2.adm';

//
// -- - Stage 2 - --
//
// For every DBLP record:
//   1. tokenize the title with counthashed-word-tokens,
//   2. join each token against TOKENSRANKEDADM on `token` and collect the
//      matching ranks, ordered by rank, into $tokensDBLP,
//   3. iterate over a leading slice of $tokensDBLP obtained with
//      subset-collection, whose length argument is computed by
//      prefix-len-jaccard (second argument .5f is presumably the Jaccard
//      similarity threshold -- TODO confirm),
//   4. emit one record per slice element carrying the record id, that
//      element, and the full ranked token list.
// The hint comment inside the `where` clause is part of the query and must
// stay exactly where it is.
// NOTE(review): $lenDBLP is bound but not referenced anywhere else in the
// query as shown; it is kept here to leave the tested query unchanged.
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
let $lenDBLP := len($tokensUnrankedDBLP)
let $tokensDBLP :=
    for $tokenUnranked in $tokensUnrankedDBLP
    for $tokenRanked in dataset('TOKENSRANKEDADM')
    where $tokenUnranked = /*+ bcast*/ $tokenRanked.token
    order by $tokenRanked.rank
    return $tokenRanked.rank
for $prefixTokenDBLP in subset-collection(
                            $tokensDBLP,
                            0,
                            prefix-len-jaccard(len($tokensDBLP), .5f))
order by $idDBLP, $prefixTokenDBLP
return {'id': $idDBLP, 'prefixToken': $prefixTokenDBLP, 'tokens': $tokensDBLP}