// asterix-app/src/test/resources/fuzzyjoin/pub/070-join-aql.aql - asterixdb - Gitiles

 use dataverse fuzzy1;

 // Record type used by the DBLP dataset declared further down.
 // It is declared `open`; in the AsterixDB data model an open type lets
 // instances carry additional fields beyond the ones listed here.
 declare type DBLPType as open {
   // Used as the primary key of the DBLP dataset.
   id: int32,
   // Emitted as 'dblpid' in the final result of the query.
   dblpid: string,
   // The join query tokenizes this field.
   title: string,
   // authors and misc are not referenced by the query that follows.
   authors: string,
   misc: string
 }

 // Record type used by the CSX dataset declared further down.
 // It is declared `open`; in the AsterixDB data model an open type lets
 // instances carry additional fields beyond the ones listed here.
 declare type CSXType as open {
   // Used as the primary key of the CSX dataset.
   id: int32,
   // Emitted as 'csxid' in the final result of the query.
   csxid: string,
   // The join query tokenizes this field.
   title: string,
   // authors and misc are not referenced by the query that follows.
   authors: string,
   misc: string
 }

 // Node group consisting of nc1 and nc2; both datasets below are declared on it.
 declare nodegroup group1 on nc1, nc2;

 // DBLP records (DBLPType), keyed by id.
 declare dataset DBLP(DBLPType)
   primary key id on group1;

 // CSX records (CSXType), keyed by id.
 declare dataset CSX(CSXType)
   primary key id on group1;

 // The result of the query that follows is written to this path on nc1.
 write output to nc1:'/tmp/pub.adm';

 //
 // Set-similarity join of DBLP and CSX on the tokens of their titles,
 // written as three nested stages:
 //   Stage 1 (innermost, appears twice): orders the tokens of all DBLP
 //           titles by ascending group size (count($paper) per token).
 //   Stage 2: replaces each paper's title tokens by their Stage-1 position,
 //           pairs DBLP and CSX papers that share a position among their
 //           prefix tokens, and keeps pairs with similarity(...) >= .5.
 //   Stage 3 (outermost): joins the surviving id pairs back to both
 //           datasets to emit ids, titles and the similarity.
 // NOTE(review): the literal .5 appears in both prefix-len calls, in
 // similarity and in the final filter; presumably these must stay equal.
 //
 // -- - Stage 3 - --
 //
 for $ridpair in
     //
     // -- - Stage 2 - --
     //
     for $paperR in dataset('DBLP')
     let $lenR := len(counthashed-word-tokens($paperR.title))
     // $tokensR: Stage-1 positions of this title's tokens, ascending.
     let $tokensR :=
         for $word in counthashed-word-tokens($paperR.title)
         for $token at $i in
             //
             // -- - Stage 1 - --
             //
             for $paper in dataset('DBLP')
             for $word in counthashed-word-tokens($paper.title)
             group by $item := $word with $paper
             // Tie-break on the token itself. Stage 1 is evaluated for the
             // R side and again for the S side, and the positions it yields
             // are compared across sides, so tokens with equal counts must
             // land in the same position every time.
             order by count($paper), $item
             return $item
         where $word = $token
         order by $i
         return $i
     // Only the leading part of $tokensR is used for candidate generation;
     // its length is whatever prefix-len returns for this title length.
     for $prefix_tokenR in subset-collection(
                                 $tokensR,
                                 0,
                                 prefix-len($lenR, 'Jaccard', .5))

     for $paperS in dataset('CSX')
     let $lenS := len(counthashed-word-tokens($paperS.title))
     // Positions are taken from the DBLP token order here as well, so both
     // sides share one ordering. A CSX token that never occurs in a DBLP
     // title matches no $token and is absent from $tokensS, while $lenS
     // still counts it.
     let $tokensS :=
         for $word in counthashed-word-tokens($paperS.title)
         for $token at $i in
             //
             // -- - Stage 1 - --
             //
             for $paper in dataset('DBLP')
             for $word in counthashed-word-tokens($paper.title)
             group by $item := $word with $paper
             // Same tie-break as on the R side; the two orderings must agree.
             order by count($paper), $item
             return $item
         where $word = $token
         order by $i
         return $i
     for $prefix_tokenS in subset-collection(
                                 $tokensS,
                                 0,
                                 prefix-len($lenS, 'Jaccard', .5))

     // Candidate pairs: papers sharing at least one prefix token position.
     where $prefix_tokenR = $prefix_tokenS

     let $sim := similarity(
                     $lenR,
                     $tokensR,
                     $lenS,
                     $tokensS,
                     $prefix_tokenR,
                     'Jaccard',
                     .5)
     where $sim >= .5
     // A pair can qualify once per shared prefix token; group by collapses
     // those rows and $sim[0] keeps the first collected similarity.
     group by $idR := $paperR.id, $idS := $paperS.id with $sim
     return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}

 // Look the full records up again by id to build the output.
 for $paperR in dataset('DBLP')
 for $paperS in dataset('CSX')
 where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
 return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
          'S': { 'csxid': $paperS.csxid, 'title': $paperS.title },
          'sim': $ridpair.sim }
	use dataverse fuzzy1;

	// Record type used by the DBLP dataset; declared `open`, which in the
	// AsterixDB data model lets instances carry fields beyond those listed.
	// NOTE(review): a DBLPType declaration with the same fields already
	// appears earlier in this file - confirm this second copy is intended.
	declare type DBLPType as open {
	// Used as the primary key of the DBLP dataset.
	id: int32,
	// Emitted as 'dblpid' in the final result of the query.
	dblpid: string,
	// The join query tokenizes this field.
	title: string,
	// authors and misc are not referenced by the query that follows.
	authors: string,
	misc: string
	}

	// Record type used by the CSX dataset; declared `open`, which in the
	// AsterixDB data model lets instances carry fields beyond those listed.
	// NOTE(review): a CSXType declaration with the same fields already
	// appears earlier in this file - confirm this second copy is intended.
	declare type CSXType as open {
	// Used as the primary key of the CSX dataset.
	id: int32,
	// Emitted as 'csxid' in the final result of the query.
	csxid: string,
	// The join query tokenizes this field.
	title: string,
	// authors and misc are not referenced by the query that follows.
	authors: string,
	misc: string
	}

	// NOTE(review): these declarations repeat ones that already appear
	// earlier in this file - confirm this second copy is intended.
	// Node group consisting of nc1 and nc2; both datasets below are declared on it.
	declare nodegroup group1 on nc1, nc2;

	// DBLP records (DBLPType), keyed by id.
	declare dataset DBLP(DBLPType)
	primary key id on group1;

	// CSX records (CSXType), keyed by id.
	declare dataset CSX(CSXType)
	primary key id on group1;

	// The result of the query that follows is written to this path on nc1.
	write output to nc1:'/tmp/pub.adm';

	//
	// Set-similarity join of DBLP and CSX on the tokens of their titles,
	// written as three nested stages:
	//   Stage 1 (innermost, appears twice): orders the tokens of all DBLP
	//           titles by ascending group size (count($paper) per token).
	//   Stage 2: replaces each paper's title tokens by their Stage-1 position,
	//           pairs DBLP and CSX papers that share a position among their
	//           prefix tokens, and keeps pairs with similarity(...) >= .5.
	//   Stage 3 (outermost): joins the surviving id pairs back to both
	//           datasets to emit ids, titles and the similarity.
	// NOTE(review): the literal .5 appears in both prefix-len calls, in
	// similarity and in the final filter; presumably these must stay equal.
	// NOTE(review): the same query already appears earlier in this file;
	// confirm this second copy is intended.
	//
	// -- - Stage 3 - --
	//
	for $ridpair in
	    //
	    // -- - Stage 2 - --
	    //
	    for $paperR in dataset('DBLP')
	    let $lenR := len(counthashed-word-tokens($paperR.title))
	    // $tokensR: Stage-1 positions of this title's tokens, ascending.
	    let $tokensR :=
	        for $word in counthashed-word-tokens($paperR.title)
	        for $token at $i in
	            //
	            // -- - Stage 1 - --
	            //
	            for $paper in dataset('DBLP')
	            for $word in counthashed-word-tokens($paper.title)
	            group by $item := $word with $paper
	            // Tie-break on the token itself. Stage 1 is evaluated for the
	            // R side and again for the S side, and the positions it yields
	            // are compared across sides, so tokens with equal counts must
	            // land in the same position every time.
	            order by count($paper), $item
	            return $item
	        where $word = $token
	        order by $i
	        return $i
	    // Only the leading part of $tokensR is used for candidate generation;
	    // its length is whatever prefix-len returns for this title length.
	    for $prefix_tokenR in subset-collection(
	                                $tokensR,
	                                0,
	                                prefix-len($lenR, 'Jaccard', .5))

	    for $paperS in dataset('CSX')
	    let $lenS := len(counthashed-word-tokens($paperS.title))
	    // Positions are taken from the DBLP token order here as well, so both
	    // sides share one ordering. A CSX token that never occurs in a DBLP
	    // title matches no $token and is absent from $tokensS, while $lenS
	    // still counts it.
	    let $tokensS :=
	        for $word in counthashed-word-tokens($paperS.title)
	        for $token at $i in
	            //
	            // -- - Stage 1 - --
	            //
	            for $paper in dataset('DBLP')
	            for $word in counthashed-word-tokens($paper.title)
	            group by $item := $word with $paper
	            // Same tie-break as on the R side; the two orderings must agree.
	            order by count($paper), $item
	            return $item
	        where $word = $token
	        order by $i
	        return $i
	    for $prefix_tokenS in subset-collection(
	                                $tokensS,
	                                0,
	                                prefix-len($lenS, 'Jaccard', .5))

	    // Candidate pairs: papers sharing at least one prefix token position.
	    where $prefix_tokenR = $prefix_tokenS

	    let $sim := similarity(
	                    $lenR,
	                    $tokensR,
	                    $lenS,
	                    $tokensS,
	                    $prefix_tokenR,
	                    'Jaccard',
	                    .5)
	    where $sim >= .5
	    // A pair can qualify once per shared prefix token; group by collapses
	    // those rows and $sim[0] keeps the first collected similarity.
	    group by $idR := $paperR.id, $idS := $paperS.id with $sim
	    return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}

	// Look the full records up again by id to build the output.
	for $paperR in dataset('DBLP')
	for $paperS in dataset('CSX')
	where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
	return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
	         'S': { 'csxid': $paperS.csxid, 'title': $paperS.title },
	         'sim': $ridpair.sim }