/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
// Setup for the fuzzyjoin_078 test: recreate the dataverse from scratch so
// the statements below never collide with leftovers from an earlier run.
drop dataverse fuzzyjoin_078 if exists;

create dataverse fuzzyjoin_078;

use dataverse fuzzyjoin_078;

// Record type for the dataset created below. Declared "open", so records
// may carry fields beyond the five listed here.
create type DBLPType as open {
  id: int32,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

// Node group naming the two nodes the dataset is placed on.
create nodegroup group1 if not exists on asterix_nc1, asterix_nc2;

// Dataset queried by the rest of this script; keyed on `id`.
create dataset DBLP_fuzzyjoin_078(DBLPType)
  primary key id on group1;

// Direct the query result to a file on asterix_nc1.
write output to asterix_nc1:'rttest/fuzzyjoin_078.adm';
//
// -- - Stage 2 - --
//
// For every paper, tokenize its title and replace each token by its
// position ($i) in the ranked token list produced by Stage 1. The
// positions are returned in ascending order as the paper's 'tokens', and
// papers are emitted in ascending id order.
// NOTE(review): the exact token form produced by counthashed-word-tokens
// is not visible in this file -- confirm against the function reference.
for $paperDBLP in dataset('DBLP_fuzzyjoin_078')
let $unrankedTokensDBLP := counthashed-word-tokens($paperDBLP.title)
let $tokensDBLP :=
    for $token in $unrankedTokensDBLP
    for $tokenRanked at $i in
        //
        // -- - Stage 1 - --
        //
        // Build the global token ordering: group every title token across
        // the whole dataset, then order the distinct tokens by how many
        // (paper, token) pairs fall in each group, breaking ties by the
        // token value itself. The hint comments below are passed to the
        // optimizer; presumably they select a hash-based group-by and an
        // in-memory sort -- verify against the AsterixDB hint docs.
        for $paper in dataset('DBLP_fuzzyjoin_078')
        for $token in counthashed-word-tokens($paper.title)
        /*+ hash */
        group by $tokenGroupped := $token with $paper
        /*+ inmem 1 302 */
        order by count($paper), $tokenGroupped
        return $tokenGroupped
    // Join each of the paper's tokens with its entry in the ranked list;
    // the bcast hint presumably requests a broadcast join -- TODO confirm.
    where $token = /*+ bcast */ $tokenRanked
    order by $i
    return $i
order by $paperDBLP.id
return {'id': $paperDBLP.id, 'tokens':$tokensDBLP}
62 return {'id': $paperDBLP.id, 'tokens':$tokensDBLP}