blob: 9077f339162e34d9fec927e4c44c3383490689eb [file] [log] [blame]
vinayakb38b7ca42012-03-05 05:44:15 +00001use dataverse fuzzy1;
2
// Record type for one DBLP publication entry. The type is declared `open`,
// so the five fields below are the declared ones; `id` is the only
// non-string field and is the key used to partition the DBLP dataset.
3declare type DBLPType as open {
4 id: int32,
5 dblpid: string,
6 title: string,
7 authors: string,
8 misc: string
9}
10
// Nodegroup `group1`: the five nodes rainbow-01 through rainbow-05.
11declare nodegroup group1 on rainbow-01, rainbow-02, rainbow-03,
12rainbow-04, rainbow-05;
13
// Dataset `DBLP` holds DBLPType records, partitioned by the `id` field
// across the nodes of nodegroup `group1`.
14declare dataset DBLP(DBLPType)
15 partitioned by key id on group1;
16
// The query result is written to this file on node rainbow-01.
17write output to rainbow-01:"/home/hyracks/dblp-self-join.adm";
18
// Fuzzy self-join of DBLP on paper titles (Jaccard, threshold .8),
// written as three nested stages:
//
//   Stage 1 (innermost, appears twice): produce the list of all title
//            tokens ordered by how many times each occurs. The position
//            `$i` of a token in that list is used as the token's rank.
//   Stage 2: for every paper, replace its title tokens by their ranks
//            (sorted by rank), take a prefix of that list via
//            subset-collection/prefix-len, and join the R and S sides on
//            a shared prefix rank. Candidate pairs are scored with
//            similarity(...) and kept when the score is >= .8.
//            Each (idR, idS) pair is emitted once, with idR < idS.
//   Stage 3 (outermost): join the surviving id pairs back to DBLP to
//            return dblpid and title of both papers plus the similarity.
//
// NOTE: Stage 1 is evaluated independently for the R side and the S side,
// and Stage 2 compares the resulting ranks across sides. The ranking must
// therefore be a total order; ordering by count alone leaves tokens with
// equal counts in an unspecified relative order, so the two evaluations
// could assign different ranks to the same token. `$item` is added as a
// secondary sort key in both copies to make the ranking deterministic.
19//
20// -- - Stage 3 - --
21//
22for $ridpair in
23 //
24 // -- - Stage 2 - --
25 //
26 for $paperR in dataset('DBLP')
27 let $tokensR :=
28 for $word in counthashed-word-tokens($paperR.title)
29 for $token at $i in
30 //
31 // -- - Stage 1 - --
32 //
33 for $paper in dataset('DBLP')
34 for $word in counthashed-word-tokens($paper.title)
35 group by $item := $word with $paper
 // tie-break on the token itself so R and S see identical ranks
36 order by count($paper), $item
37 return $item
38 where $word = $token
39 order by $i
40 return $i
41 for $prefix_tokenR in subset-collection(
42 $tokensR,
43 0,
44 prefix-len(
45 len($tokensR), 'Jaccard', .8))
46
47 for $paperS in dataset('DBLP')
48 let $tokensS :=
49 for $word in counthashed-word-tokens($paperS.title)
50 for $token at $i in
51 //
52 // -- - Stage 1 - --
53 //
54 for $paper in dataset('DBLP')
55 for $word in counthashed-word-tokens($paper.title)
56 group by $item := $word with $paper
 // same tie-break as the R-side copy of Stage 1; the two must agree
57 order by count($paper), $item
58 return $item
59 where $word = $token
60 order by $i
61 return $i
62 for $prefix_tokenS in subset-collection(
63 $tokensS,
64 0,
65 prefix-len(
66 len($tokensS), 'Jaccard', .8))
67
 // candidate generation: R and S share at least one prefix token rank
68 where $prefix_tokenR = $prefix_tokenS
69
70 let $sim := similarity(
71 len(counthashed-word-tokens($paperR.title)),
72 $tokensR,
73 len(counthashed-word-tokens($paperS.title)),
74 $tokensS,
75 $prefix_tokenR,
76 'Jaccard',
77 .8)
78 where $sim >= .8 // and $paperR.id != $paperS.id
 // a pair can match on several prefix tokens; collapse to one row per pair
79 group by $idR := $paperR.id, $idS := $paperS.id with $sim
 // keep one orientation of each pair and drop self-pairs
80 where $idR < $idS
81 return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}
82
83for $paperR in dataset('DBLP')
84for $paperS in dataset('DBLP')
85where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
86return {'R': {'dblpid': $paperR.dblpid, 'title': $paperR.title},
87 'S': {'dblpid': $paperS.dblpid, 'title': $paperS.title},
88 'sim': $ridpair.sim}
88 'sim': $ridpair.sim}