blob: c49131b24da27388484ad5389e71c7a64ce88862 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
// All declarations and the query below are resolved inside dataverse fuzzy1.
use dataverse fuzzy1;
20
// Record type for the DBLP dataset. Declared open, so instances may carry
// fields beyond the five listed here. `id` is used as the primary key by the
// DBLP dataset declaration; `title` is the field the fuzzy join tokenizes.
declare type DBLPType as open {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}
28
// Record type for the CSX dataset. Declared open, so instances may carry
// fields beyond the five listed here. `id` is used as the primary key by the
// CSX dataset declaration; `title` is the field the fuzzy join tokenizes.
declare type CSXType as open {
  id: int32,
  csxid: string,
  title: string,
  authors: string,
  misc: string
}
36
// Node group spanning the two node controllers that hold both datasets.
declare nodegroup group1 on asterix_nc1, asterix_nc2;

// DBLP side of the join: keyed on id, placed on group1.
declare dataset DBLP(DBLPType)
  primary key id on group1;

// CSX side of the join: keyed on id, placed on group1.
declare dataset CSX(CSXType)
  primary key id on group1;

// The query result is written to this file on node asterix_nc1.
write output to asterix_nc1:'/tmp/pub.adm';
vinayakb38b7ca42012-03-05 05:44:15 +000046
//
// Fuzzy join of DBLP and CSX on title, written as three nested stages:
//   Stage 1 - build a global token order from the DBLP titles.
//   Stage 2 - produce (idR, idS, sim) pairs whose prefixes share a token and
//             whose similarity is at least .5.
//   Stage 3 - join those id pairs back to the full records for output.
//

//
// -- - Stage 3 - --
// Consumes the id pairs produced by Stage 2 and looks up both records.
//
for $ridpair in
    //
    // -- - Stage 2 - --
    // R side: every DBLP paper, with its title token count and its tokens
    // rewritten as positions in the Stage 1 order.
    //
    for $paperR in dataset('DBLP')
    let $lenR := len(counthashed-word-tokens($paperR.title))
    let $tokensR :=
        for $word in counthashed-word-tokens($paperR.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            // Distinct title tokens of DBLP, ordered by the number of
            // grouped occurrences (ascending); $i is the token's position
            // in that order.
            //
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            order by count($paper)
            return $item
        where $word = $token
        order by $i
        return $i
    // One binding per token in the prefix of $tokensR; the prefix length
    // comes from prefix-len for a Jaccard threshold of .5.
    for $prefix_tokenR in subset-collection(
                              $tokensR,
                              0,
                              prefix-len($lenR, 'Jaccard', .5))

    // S side: same construction for every CSX paper.
    for $paperS in dataset('CSX')
    let $lenS := len(counthashed-word-tokens($paperS.title))
    let $tokensS :=
        for $word in counthashed-word-tokens($paperS.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            // Same token order as on the R side: it is computed from DBLP
            // here as well, so both sides use one shared ranking.
            //
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            order by count($paper)
            return $item
        where $word = $token
        order by $i
        return $i
    for $prefix_tokenS in subset-collection(
                              $tokensS,
                              0,
                              prefix-len($lenS, 'Jaccard', .5))

    // Candidate pairs: the two prefixes have a token in common.
    where $prefix_tokenR = $prefix_tokenS

    let $sim := similarity(
                    $lenR,
                    $tokensR,
                    $lenS,
                    $tokensS,
                    $prefix_tokenR,
                    'Jaccard',
                    .5)
    where $sim >= .5
    // A pair can qualify through more than one shared prefix token; grouping
    // on the two ids collapses those duplicates, and $sim[0] picks the first
    // similarity value collected for the group.
    group by $idR := $paperR.id, $idS := $paperS.id with $sim
    return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}

for $paperR in dataset('DBLP')
for $paperS in dataset('CSX')
where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
         'S': { 'csxid': $paperS.csxid, 'title': $paperS.title },
         'sim': $ridpair.sim }