blob: f41892119e795be44a3d70987250d3dff1bfee20 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
// All statements below run in the fuzzy1 dataverse.
use dataverse fuzzy1;

// Record type of the DBLP dataset. The query in this file reads only
// id (join key / pair ordering), dblpid and title.
declare type DBLPType as open {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}
28
// Node group naming the two node controllers used by the dataset.
declare nodegroup group1 on nc1, nc2;

// DBLP dataset of DBLPType records, keyed on id and placed on group1.
declare dataset DBLP(DBLPType)
  primary key id on group1;

// Query results are written to this file on nc1.
write output to nc1:'/tmp/dblp.adm';
35
//
// Self-join of DBLP on title similarity ('Jaccard', threshold .5),
// written as three nested stages:
//   Stage 1 - list every title token, ordered by the number of papers
//             grouped under it.
//   Stage 2 - for each paper, map its title tokens to their positions in
//             the Stage 1 list, pair up papers that share a prefix token,
//             and keep pairs whose similarity() is >= .5.
//   Stage 3 - join the surviving id pairs back to DBLP to emit the
//             dblpid and title of both papers together with the score.
//
// NOTE(review): the exact semantics of counthashed-word-tokens,
// subset-collection, prefix-len and similarity are not visible here;
// the comments below describe only how this query uses them.
//

//
// -- - Stage 3 - --
//
for $ridpair in
    //
    // -- - Stage 2 - --
    //
    for $paperR in dataset('DBLP')
    // $tokensR: positions ($i) of $paperR's title tokens within the
    // Stage 1 token list, sorted by position.
    let $tokensR :=
        for $word in counthashed-word-tokens($paperR.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            //
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            order by count($paper)
            return $item
        where $word = $token
        order by $i
        return $i
    // Iterate over a sub-collection of $tokensR starting at offset 0 whose
    // length is given by prefix-len for this token count and threshold.
    for $prefix_tokenR in subset-collection(
                              $tokensR,
                              0,
                              prefix-len(
                                  len($tokensR), 'Jaccard', .5))

    // Same token ranking and prefix extraction for the other join side.
    for $paperS in dataset('DBLP')
    let $tokensS :=
        for $word in counthashed-word-tokens($paperS.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            //
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            order by count($paper)
            return $item
        where $word = $token
        order by $i
        return $i
    for $prefix_tokenS in subset-collection(
                              $tokensS,
                              0,
                              prefix-len(
                                  len($tokensS), 'Jaccard', .5))

    // Candidate pairs: papers that share at least one prefix token.
    where $prefix_tokenR = $prefix_tokenS

    let $sim := similarity(
                    len(counthashed-word-tokens($paperR.title)),
                    $tokensR,
                    len(counthashed-word-tokens($paperS.title)),
                    $tokensS,
                    $prefix_tokenR,
                    'Jaccard',
                    .5)
    // Keep pairs at or above the threshold; id < id drops self-pairs and
    // keeps only one ordering of each pair.
    where $sim >= .5 and $paperR.id < $paperS.id
    // A pair can match on several prefix tokens; grouping by the id pair
    // collapses those duplicates and $sim[0] takes the first collected score.
    group by $idR := $paperR.id, $idS := $paperS.id with $sim
    return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}

// Look up both records of each id pair to build the output record.
for $paperR in dataset('DBLP')
for $paperS in dataset('DBLP')
where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
         'S': { 'dblpid': $paperS.dblpid, 'title': $paperS.title },
         'sim': $ridpair.sim }