blob: 7171e4320c1bce872945026743ecfd69d649284a [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
// All statements below are evaluated in the fuzzy1 dataverse.
use dataverse fuzzy1;

// Record type for one DBLP entry (declared open). The query below reads
// only id, dblpid and title.
declare type DBLPType as open {
    id: int32,
    dblpid: string,
    title: string,
    authors: string,
    misc: string
}

// Node group of five nodes on which the dataset is placed.
declare nodegroup group1 on
    rainbow-01, rainbow-02, rainbow-03, rainbow-04, rainbow-05;

// DBLP dataset of DBLPType records, keyed by id, placed on group1.
declare dataset DBLP(DBLPType)
    primary key id on group1;

// Query results are written to this file on node rainbow-01.
write output to rainbow-01:"/home/hyracks/dblp-self-join.adm";
36
//
// Stage 3: for every (idR, idS, sim) match produced by Stage 2, look both
// papers up in DBLP again and emit their dblpid/title next to the score.
//
for $match in
    //
    // Stage 2: self-join of DBLP on title tokens. Each side replaces its
    // title tokens by their position in the Stage 1 ordering, keeps a
    // prefix of that list, and pairs are formed where the two prefixes
    // share an element. Candidate pairs are then scored with similarity().
    // NOTE(review): prefix-len / subset-collection / similarity are opaque
    // here; presumably they implement Jaccard prefix filtering at
    // threshold .8 - confirm against the function documentation.
    //
    for $left in dataset('DBLP')
    let $leftRanks :=
        for $leftWord in counthashed-word-tokens($left.title)
        for $rankedToken at $rank in
            //
            // Stage 1: one entry per distinct title token across DBLP,
            // ordered by how many (paper, token) occurrences it has.
            // NOTE(review): tokens with equal counts have no explicit
            // tie-break; confirm that both copies of this subquery
            // produce the same order.
            //
            for $doc in dataset('DBLP')
            for $docWord in counthashed-word-tokens($doc.title)
            group by $distinctWord := $docWord with $doc
            order by count($doc)
            return $distinctWord
        where $leftWord = $rankedToken
        order by $rank
        return $rank
    for $leftPrefix in subset-collection(
        $leftRanks, 0, prefix-len(len($leftRanks), 'Jaccard', .8))

    // Right-hand side: same token ranking and prefix extraction as above.
    for $right in dataset('DBLP')
    let $rightRanks :=
        for $rightWord in counthashed-word-tokens($right.title)
        for $rankedToken at $rank in
            //
            // Stage 1 (second copy of the same token-ordering subquery).
            //
            for $doc in dataset('DBLP')
            for $docWord in counthashed-word-tokens($doc.title)
            group by $distinctWord := $docWord with $doc
            order by count($doc)
            return $distinctWord
        where $rightWord = $rankedToken
        order by $rank
        return $rank
    for $rightPrefix in subset-collection(
        $rightRanks, 0, prefix-len(len($rightRanks), 'Jaccard', .8))

    // Candidate pairs: the two papers share at least one prefix token.
    where $leftPrefix = $rightPrefix

    let $score := similarity(
        len(counthashed-word-tokens($left.title)), $leftRanks,
        len(counthashed-word-tokens($right.title)), $rightRanks,
        $leftPrefix, 'Jaccard', .8)
    // No explicit id != id test is applied here; the strict '<' on the
    // ids after grouping drops self-pairs and mirrored pairs.
    where $score >= .8
    // A pair can match on several prefix tokens; collapse to one row per
    // id pair and keep the first collected score.
    group by $leftId := $left.id, $rightId := $right.id with $score
    where $leftId < $rightId
    return {'idR': $leftId, 'idS': $rightId, 'sim': $score[0]}

for $recR in dataset('DBLP')
for $recS in dataset('DBLP')
where $match.idR = $recR.id and $match.idS = $recS.id
return {
    'R': {'dblpid': $recR.dblpid, 'title': $recR.title},
    'S': {'dblpid': $recS.dblpid, 'title': $recS.title},
    'sim': $match.sim
}