blob: bc024c3fe40e001bb0a9ad95120b204fec56c79c [file] [log] [blame]
vinayakb38b7ca42012-03-05 05:44:15 +00001use dataverse fuzzy1;
2
// Record layout for one DBLP entry. `id` is the dataset's partitioning key;
// `title` is the only field the similarity join tokenizes. The type is
// declared `open`.
declare type DBLPType as open {
    id: int32,
    dblpid: string,
    title: string,
    authors: string,
    misc: string
}
10
// Node group `group1` is declared on the single node `nc1`.
declare nodegroup group1 on nc1;

// The DBLP dataset stores DBLPType records, partitioned by `id` on `group1`.
declare dataset DBLP(DBLPType) partitioned by key id on group1;

// Query results are written to this file on node `nc1`.
write output to nc1:"/tmp/amerix.adm";
17
//
// Self-join of DBLP on title similarity ('Jaccard', threshold .8), written
// as three nested stages:
//
//   Stage 1 - builds one global ordering of all title tokens, sorted by
//             ascending occurrence count. Ties are broken by the token
//             value itself so that the two copies of this subquery (one
//             for the R side, one for the S side) always yield the same
//             sequence; without the tie-breaker, tokens with equal counts
//             could receive different ranks on the two sides.
//   Stage 2 - replaces each paper's tokens by their position in the
//             Stage 1 ordering, keeps the leading prefix-len(...) ranks of
//             each paper, pairs up papers that share a prefix rank, and
//             keeps the pairs whose similarity(...) is at least .8.
//   Stage 3 - joins the resulting id pairs back to DBLP to report the
//             dblpid and title of both papers together with the score.
//
// NOTE(review): counthashed-word-tokens, prefix-len, subset-collection and
// similarity are engine built-ins; their exact semantics are not visible in
// this file - confirm against the AsterixDB function documentation.
//
// -- - Stage 3 - --
//
for $ridpair in
    //
    // -- - Stage 2 - --
    //
    for $paperR in dataset('DBLP')
    // $tokensR: ranks (positions in the Stage 1 ordering) of the tokens of
    // $paperR.title, in ascending rank order.
    let $tokensR :=
        for $word in counthashed-word-tokens($paperR.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            //
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            // Secondary key $item makes the ordering total (deterministic).
            order by count($paper), $item
            return $item
        where $word = $token
        order by $i
        return $i
    // One binding per rank in the leading prefix of $tokensR.
    for $prefix_tokenR in subset-collection(
                              $tokensR,
                              0,
                              prefix-len(
                                  len($tokensR), 'Jaccard', .8))

    for $paperS in dataset('DBLP')
    // $tokensS: same construction as $tokensR, for the S side.
    let $tokensS :=
        for $word in counthashed-word-tokens($paperS.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            //
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            // Must stay identical to the R-side copy so ranks agree.
            order by count($paper), $item
            return $item
        where $word = $token
        order by $i
        return $i
    for $prefix_tokenS in subset-collection(
                              $tokensS,
                              0,
                              prefix-len(
                                  len($tokensS), 'Jaccard', .8))

    // Candidate pairs: R and S share at least one prefix rank.
    where $prefix_tokenR = $prefix_tokenS

    let $sim := similarity(
                    len(counthashed-word-tokens($paperR.title)),
                    $tokensR,
                    len(counthashed-word-tokens($paperS.title)),
                    $tokensS,
                    $prefix_tokenR,
                    'Jaccard',
                    .8)
    // $paperR.id < $paperS.id keeps each unordered pair once and drops
    // self-pairs.
    where $sim >= .8 and $paperR.id < $paperS.id
    // A pair can match on several prefix ranks; grouping collapses those
    // rows into one, and $sim[0] takes the first collected score.
    group by $idR := $paperR.id, $idS := $paperS.id with $sim
    return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}

for $paperR in dataset('DBLP')
for $paperS in dataset('DBLP')
where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
return {'R': {'dblpid': $paperR.dblpid, 'title': $paperR.title},
        'S': {'dblpid': $paperS.dblpid, 'title': $paperS.title},
        'sim': $ridpair.sim}