// blob: 4ac1a608e44ffbb0eabdf0a39afa9526aafe5391 [file] [log] [blame]
// Setup for the DBLP title self-join query: selects the dataverse, declares
// the record type, the nodegroup and the dataset, and names the output file.

// All following declarations and the query run in dataverse fuzzy1.
use dataverse fuzzy1;
// Open record type for one DBLP entry: the five fields below are declared;
// being "open", instances may carry additional undeclared fields.
declare type DBLPType as open {
id: int32,
dblpid: string,
title: string,
authors: string,
misc: string
}
// Nodegroup made of the two nodes nc1 and nc2.
declare nodegroup group1 on nc1, nc2;
// Dataset DBLP of DBLPType records, keyed on id and placed on group1.
declare dataset DBLP(DBLPType)
primary key id on group1;
// The query result is written to this file on node nc1.
write output to nc1:'/tmp/dblp.adm';
//
// Self-join of DBLP on title similarity ('Jaccard', threshold .5), written as
// three nested stages:
//   Stage 1 - rank every title token by how often it occurs across DBLP;
//   Stage 2 - find candidate (idR, idS) pairs that share a prefix token rank
//             and keep those whose similarity is >= .5;
//   Stage 3 - join the id pairs back to DBLP to return dblpid and title.
//
// -- - Stage 3 - --
//
for $ridpair in
    //
    // -- - Stage 2 - --
    //
    for $paperR in dataset('DBLP')
    // $tokensR: the Stage 1 ranks ($i) of the tokens of $paperR.title, in
    // ascending rank order.
    let $tokensR :=
        for $word in counthashed-word-tokens($paperR.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            //
            // Tokens ordered by ascending occurrence count. $item is a
            // secondary key so that tokens with equal counts get the same
            // rank in this evaluation and in the identical one used for
            // $tokensS below; ranks from both sides are compared directly.
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            order by count($paper), $item
            return $item
        where $word = $token
        order by $i
        return $i
    // Iterate over the prefix of $tokensR: the sub-collection starting at
    // offset 0 whose length prefix-len derives from the token count, the
    // similarity function name and the threshold.
    for $prefix_tokenR in subset-collection(
                              $tokensR,
                              0,
                              prefix-len(
                                  len($tokensR), 'Jaccard', .5))
    for $paperS in dataset('DBLP')
    // $tokensS: same construction as $tokensR, for $paperS.title.
    let $tokensS :=
        for $word in counthashed-word-tokens($paperS.title)
        for $token at $i in
            //
            // -- - Stage 1 - --
            //
            // Must stay identical to the Stage 1 ranking used for $tokensR
            // (including the $item tie-breaker) so both sides share one
            // token order.
            for $paper in dataset('DBLP')
            for $word in counthashed-word-tokens($paper.title)
            group by $item := $word with $paper
            order by count($paper), $item
            return $item
        where $word = $token
        order by $i
        return $i
    for $prefix_tokenS in subset-collection(
                              $tokensS,
                              0,
                              prefix-len(
                                  len($tokensS), 'Jaccard', .5))
    // Candidate pairs: R and S share at least one prefix token rank.
    where $prefix_tokenR = $prefix_tokenS
    let $sim := similarity(
                    len(counthashed-word-tokens($paperR.title)),
                    $tokensR,
                    len(counthashed-word-tokens($paperS.title)),
                    $tokensS,
                    $prefix_tokenR,
                    'Jaccard',
                    .5)
    // Keep pairs at or above the threshold; idR < idS drops self-pairs and
    // keeps each unordered pair only once.
    where $sim >= .5 and $paperR.id < $paperS.id
    // A pair can be reached through several shared prefix tokens; grouping
    // on the id pair collapses those duplicates and $sim[0] takes the first
    // grouped similarity value.
    group by $idR := $paperR.id, $idS := $paperS.id with $sim
    return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}
// Look up both records of each surviving id pair.
for $paperR in dataset('DBLP')
for $paperS in dataset('DBLP')
where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
         'S': { 'dblpid': $paperS.dblpid, 'title': $paperS.title },
         'sim': $ridpair.sim }