blob: c49131b24da27388484ad5389e71c7a64ce88862 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
 * Setup for the query below: selects the dataverse, declares the two record
 * types and the datasets that hold them, and names the output file.
 */
use dataverse fuzzy1;
/* Record type for the DBLP dataset; `id` is used as its primary key below. */
declare type DBLPType as open {
id: int32,
dblpid: string,
title: string,
authors: string,
misc: string
}
/*
 * Record type for the CSX dataset. Same fields as DBLPType except that the
 * source identifier field is `csxid` instead of `dblpid`.
 */
declare type CSXType as open {
id: int32,
csxid: string,
title: string,
authors: string,
misc: string
}
/* Both datasets are declared on this nodegroup, which lists two nodes. */
declare nodegroup group1 on asterix_nc1, asterix_nc2;
declare dataset DBLP(DBLPType)
primary key id on group1;
declare dataset CSX(CSXType)
primary key id on group1;
/* Query result is written to this path on node asterix_nc1. */
write output to asterix_nc1:'/tmp/pub.adm';
/*
 * Three-stage join between DBLP and CSX on their `title` fields.
 *
 *   Stage 1 (innermost, appears twice): builds a list of title tokens from
 *            DBLP, grouped by token and ordered by the number of papers in
 *            each group.
 *   Stage 2: for every DBLP/CSX paper, maps its title tokens to their
 *            positions in the Stage 1 list, pairs papers that share a
 *            prefix token, and keeps pairs whose `similarity` is >= .5.
 *   Stage 3 (outermost): looks the surviving id pairs up again to return
 *            dblpid/title, csxid/title and the similarity value.
 *
 * NOTE(review): counthashed-word-tokens, subset-collection, prefix-len and
 * similarity are built-ins not defined in this file; their descriptions
 * here are inferred from how they are called - confirm against the
 * AsterixDB function reference.
 */
//
// -- - Stage 3 - --
//
for $ridpair in
//
// -- - Stage 2 - --
//
for $paperR in dataset('DBLP')
/* Number of tokens in the DBLP title; passed to prefix-len and similarity. */
let $lenR := len(counthashed-word-tokens($paperR.title))
/*
 * For each title token, the position ($i) of the equal token in the Stage 1
 * list, returned in increasing position order.
 */
let $tokensR :=
for $word in counthashed-word-tokens($paperR.title)
for $token at $i in
//
// -- - Stage 1 - --
//
for $paper in dataset('DBLP')
for $word in counthashed-word-tokens($paper.title)
group by $item := $word with $paper
order by count($paper)
return $item
where $word = $token
order by $i
return $i
/*
 * Iterates over a leading slice of $tokensR starting at offset 0; the slice
 * length comes from prefix-len for threshold .5.
 * NOTE(review): presumably the prefix-filtering step of a set-similarity
 * join - confirm.
 */
for $prefix_tokenR in subset-collection(
$tokensR,
0,
prefix-len($lenR, 'Jaccard', .5))
/* Same derivation for the CSX side. */
for $paperS in dataset('CSX')
let $lenS := len(counthashed-word-tokens($paperS.title))
let $tokensS :=
for $word in counthashed-word-tokens($paperS.title)
for $token at $i in
//
// -- - Stage 1 - --
//
/*
 * This Stage 1 subquery reads dataset('DBLP'), exactly like the one used
 * for $tokensR, so both sides are numbered against the same token list.
 * NOTE(review): presumably intentional (a shared token order) - confirm.
 */
for $paper in dataset('DBLP')
for $word in counthashed-word-tokens($paper.title)
group by $item := $word with $paper
order by count($paper)
return $item
where $word = $token
order by $i
return $i
for $prefix_tokenS in subset-collection(
$tokensS,
0,
prefix-len($lenS, 'Jaccard', .5))
/* Candidate pairs: papers whose prefixes share a token position. */
where $prefix_tokenR = $prefix_tokenS
let $sim := similarity(
$lenR,
$tokensR,
$lenS,
$tokensS,
$prefix_tokenR,
'Jaccard',
.5)
where $sim >= .5
/*
 * One output record per (idR, idS) pair; $sim is grouped into a collection
 * and only its first element is returned.
 */
group by $idR := $paperR.id, $idS := $paperS.id with $sim
return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}
/*
 * Stage 3 body: $paperR / $paperS are bound again here (the Stage 2 bindings
 * are inside the subquery above) and matched to each id pair by primary key.
 */
for $paperR in dataset('DBLP')
for $paperS in dataset('CSX')
where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
'S': { 'csxid': $paperS.csxid, 'title': $paperS.title },
'sim': $ridpair.sim }