// Fuzzy-join over DBLP, stage 2 only.
// (Recovered from repository blob ce8dae4c640fd32247777b8ead14e12a440c7fb9;
// code-viewer line numbers and blame metadata have been stripped so the
// script parses as AQL again.)
//
// For every DBLP paper this script emits one record per prefix token,
// carrying the paper id, that prefix token, and the paper's full list of
// ranked tokens. Token ranks are read from the pre-computed TOKENSRANKEDADM
// dataset rather than being computed here.

// Start from a clean dataverse so reruns do not see stale state.
drop dataverse fuzzyjoin if exists;

create dataverse fuzzyjoin;

use dataverse fuzzyjoin;

// One DBLP publication record; closed, so no extra fields are allowed.
create type DBLPType as closed {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

// Maps a token value to its rank.
create type TOKENSRANKEDADMType as closed {
  token: int32,
  rank: int32
}

create nodegroup group1 if not exists on nc1, nc2;

// Both datasets live on group1; DBLP is keyed by id, the rank table by rank.
create dataset DBLP(DBLPType) partitioned by key id on group1;
create dataset TOKENSRANKEDADM(TOKENSRANKEDADMType) partitioned by key rank on group1;

// DBLP input is ':'-delimited text, declared pre-sorted to the loader.
load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;

// Token ranks are loaded from an ADM file.
load dataset TOKENSRANKEDADM
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/tokensranked.adm"),("format"="adm"));

write output to nc1:'rttest/fuzzyjoin_dblp-2.2.adm';

//
// -- - Stage 2 - --
//
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
// NOTE(review): $lenDBLP is not referenced anywhere below in this script;
// it is kept as-is because removing it could change the plan this test
// exercises -- confirm before deleting.
let $lenDBLP := len($tokensUnrankedDBLP)
// Replace each title token by its rank, in ascending rank order. Tokens
// with no match in TOKENSRANKEDADM drop out of this join.
let $tokensDBLP :=
    for $tokenUnranked in $tokensUnrankedDBLP
    for $tokenRanked in dataset('TOKENSRANKEDADM')
    // The /*+ bcast*/ hint annotates the join on the ranked side
    // (presumably requesting a broadcast join -- verify against the
    // optimizer's hint handling).
    where $tokenUnranked = /*+ bcast*/ $tokenRanked.token
    order by $tokenRanked.rank
    return $tokenRanked.rank
// Iterate over the prefix of the ranked token list: starting at offset 0,
// with the length given by prefix-len-jaccard for the ranked-token count
// and the .5f argument (presumably the Jaccard threshold -- TODO confirm).
for $prefixTokenDBLP in subset-collection(
        $tokensDBLP,
        0,
        prefix-len-jaccard(len($tokensDBLP), .5f))
order by $idDBLP, $prefixTokenDBLP
return {'id': $idDBLP, 'prefixToken': $prefixTokenDBLP, 'tokens': $tokensDBLP}