// blob: ec8867988d49f2fb45636b19109767e908eb9299
// Select the fuzzy1 dataverse for the statements that follow.
use dataverse fuzzy1;

// Record type for the DBLP dataset. Declared `open`; the five fields listed
// here are the ones the type names explicitly. `id` is the dataset's
// partitioning key and `title` is the field the fuzzy join tokenizes.
declare type DBLPType as open {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

// Record type for the CSX dataset. Same layout as DBLPType except that the
// source identifier field is `csxid` instead of `dblpid`.
declare type CSXType as open {
  id: int32,
  csxid: string,
  title: string,
  authors: string,
  misc: string
}

// Node group made of the two nodes nc1 and nc2; both datasets are placed on it.
declare nodegroup group1 on nc1, nc2;

// DBLP dataset of DBLPType records, partitioned by `id` across group1.
declare dataset DBLP(DBLPType)
  partitioned by key id on group1;

// CSX dataset of CSXType records, partitioned by `id` across group1.
declare dataset CSX(CSXType)
  partitioned by key id on group1;

// Send the query result to the file /tmp/pub.adm on node nc1.
write output to nc1:'/tmp/pub.adm';

//
// Three-stage fuzzy join of DBLP and CSX on `title`, using the 'Jaccard'
// measure with threshold .5:
//   Stage 1 ranks the title tokens of DBLP,
//   Stage 2 finds (idR, idS) pairs that share a prefix token and whose
//           similarity() result is at least .5,
//   Stage 3 joins those id pairs back to the full records.
//

//
// -- - Stage 3 - --
//
// For every {idR, idS, sim} record produced by Stage 2, look up the matching
// DBLP and CSX records by id and return their source ids and titles together
// with the similarity value.
//
for $ridpair in
    //
    // -- - Stage 2 - --
    //
    // R side: for each DBLP record compute its token count ($lenR) and the
    // ascending list of its tokens' Stage 1 ranks ($tokensR), then emit one
    // row per rank in the record's prefix.
    //
    for $paperR in dataset('DBLP')
    let $lenR := len(counthashed-word-tokens($paperR.title))
    let $tokensR :=
        for $word in counthashed-word-tokens($paperR.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            //
            // One item per distinct title token in DBLP, ordered by the
            // number of grouped $paper entries (ascending). The positional
            // variable $i above is therefore the token's rank in that order.
            //
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            order by count($paper)
            return $item
        // Keep only the rank of the token equal to this record's word.
        where $word = $token
        order by $i
        return $i
    // The prefix is the leading slice of $tokensR, starting at offset 0, whose
    // length is given by prefix-len for this record length and threshold.
    for $prefix_tokenR in subset-collection(
            $tokensR,
            0,
            prefix-len($lenR, 'Jaccard', .5))

    //
    // S side: same computation for each CSX record.
    // NOTE(review): the ranking subquery below reads dataset('DBLP'), not
    // 'CSX', so both sides share one token order. Because of the
    // `where $word = $token` join, a CSX title token that never occurs in
    // DBLP gets no rank in $tokensS, while $lenS still counts it - confirm
    // this is the intended behavior.
    //
    for $paperS in dataset('CSX')
    let $lenS := len(counthashed-word-tokens($paperS.title))
    let $tokensS :=
        for $word in counthashed-word-tokens($paperS.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            //
            // Same token ranking over DBLP as on the R side.
            //
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            order by count($paper)
            return $item
        where $word = $token
        order by $i
        return $i
    for $prefix_tokenS in subset-collection(
            $tokensS,
            0,
            prefix-len($lenS, 'Jaccard', .5))

    // Candidate pairs: the two records have a prefix token rank in common.
    where $prefix_tokenR = $prefix_tokenS

    // Verify the candidate. The arguments are both lengths, both rank lists,
    // the shared prefix token, the measure name and the threshold.
    let $sim := similarity(
            $lenR,
            $tokensR,
            $lenS,
            $tokensS,
            $prefix_tokenR,
            'Jaccard',
            .5)
    where $sim >= .5
    // A pair may appear once per shared prefix token; grouping on the id pair
    // yields a single row per pair, and $sim[0] takes the first collected
    // similarity value of the group.
    group by $idR := $paperR.id, $idS := $paperS.id with $sim
    return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}

// Resolve the id pair to the actual records of both datasets.
for $paperR in dataset('DBLP')
for $paperS in dataset('CSX')
where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
         'S': { 'csxid': $paperS.csxid, 'title': $paperS.title },
         'sim': $ridpair.sim }