/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
19
/*
 * Reset the working database: remove any existing fuzzyjoin database,
 * create it again, and select it for the statements that follow.
 */
drop database fuzzyjoin if exists;
create database fuzzyjoin;

use fuzzyjoin;
24
25
/*
 * Record type for one DBLP entry: an int32 id plus four string fields
 * (dblpid, title, authors, misc).
 *
 * The statement is closed with an explicit semicolon so that it is
 * terminated the same way as every other statement in this script.
 */
create type fuzzyjoin.DBLPType as
{
  id : int32,
  dblpid : string,
  title : string,
  authors : string,
  misc : string
};
34
/* Table holding DBLPType records, with id as its primary key. */
create table DBLP(DBLPType)
  primary key id;
36
/*
 * Session setting: switch import-private-functions on.
 * NOTE(review): presumably needed so the hyphenated fuzzyjoin functions
 * called by the query below can be resolved - confirm.
 */
set "import-private-functions" "true";
38
/*
 * Self-join of DBLP on title tokens.
 *
 * The inner subquery (ridpair) emits {idLeft, idRight, sim} records:
 *   - lenLeft / lenRight: fuzzyjoin.len over the counthashed-word-tokens
 *     of each paper's title;
 *   - tokensLeft / tokensRight: for each title token, its position i in a
 *     list of all DBLP title tokens ordered by fuzzyjoin.count(paper) and
 *     then by the token itself;
 *   - prefixTokenLeft / prefixTokenRight: items of subset-collection over
 *     those position lists, starting at 0, with a length computed by
 *     prefix-len-jaccard(len, 0.5);
 *   - rows are kept when both prefix items are equal, sim >= 0.5 and
 *     paperLeft.id < paperRight.id, then grouped by the pair of ids.
 *
 * The outer query joins each id pair back to DBLP to return both full
 * records together with sim, ordered by left id and then right id.
 */
select element {
  'left' : paperLeft,
  'right' : paperRight,
  'sim' : ridpair.sim
}
from
  DBLP as paperLeft,
  DBLP as paperRight,
  (
    select element {
      'idLeft' : idLeft,
      'idRight' : idRight,
      'sim' : sim[0]
    }
    from
      DBLP as paperLeft,
      fuzzyjoin."subset-collection"(
        tokensLeft,
        0,
        fuzzyjoin."prefix-len-jaccard"(lenLeft, 0.500000f)
      ) as prefixTokenLeft,
      DBLP as paperRight,
      fuzzyjoin."subset-collection"(
        tokensRight,
        0,
        fuzzyjoin."prefix-len-jaccard"(lenRight, 0.500000f)
      ) as prefixTokenRight
    with
      /* Left side: token count and ranked token positions of the title. */
      lenLeft as fuzzyjoin.len(
        fuzzyjoin."counthashed-word-tokens"(paperLeft.title)
      ),
      tokensLeft as (
        select element i
        from
          fuzzyjoin."counthashed-word-tokens"(paperLeft.title) as tokenUnranked,
          (
            select element tokenGroupped
            from
              DBLP as paper,
              fuzzyjoin."counthashed-word-tokens"(paper.title) as token
            group by token as tokenGroupped
            order by fuzzyjoin.count(paper), tokenGroupped
          ) as tokenRanked at i
        where tokenUnranked = tokenRanked
        order by i
      ),
      /* Right side: same computation for the other paper. */
      lenRight as fuzzyjoin.len(
        fuzzyjoin."counthashed-word-tokens"(paperRight.title)
      ),
      tokensRight as (
        select element i
        from
          fuzzyjoin."counthashed-word-tokens"(paperRight.title) as tokenUnranked,
          (
            select element tokenGroupped
            from
              DBLP as paper,
              fuzzyjoin."counthashed-word-tokens"(paper.title) as token
            group by token as tokenGroupped
            order by fuzzyjoin.count(paper), tokenGroupped
          ) as tokenRanked at i
        where tokenUnranked = tokenRanked
        order by i
      ),
      sim as fuzzyjoin."similarity-jaccard-prefix"(
        lenLeft,
        tokensLeft,
        lenRight,
        tokensRight,
        prefixTokenLeft,
        0.500000f
      )
    where prefixTokenLeft = prefixTokenRight
      and (sim >= 0.500000f and paperLeft.id < paperRight.id)
    group by
      paperLeft.id as idLeft,
      paperRight.id as idRight
  ) as ridpair
where ridpair.idLeft = paperLeft.id
  and ridpair.idRight = paperRight.id
order by
  paperLeft.id,
  paperRight.id;