asterix-app/src/test/resources/demo0216/110-self-join-dblp.aql - asterixdb - Gitiles

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 // Every statement below runs inside the fuzzy1 dataverse.
 use dataverse fuzzy1;

 // Record layout of one DBLP entry: declared as an open type with
 // five named fields.
 declare type DBLPType as open {
     id: int32,
     dblpid: string,
     title: string,
     authors: string,
     misc: string
 }

 // group1 names the five rainbow-0x nodes.
 declare nodegroup group1 on
     rainbow-01,
     rainbow-02,
     rainbow-03,
     rainbow-04,
     rainbow-05;

 // DBLP stores DBLPType records, keyed on id, on nodegroup group1.
 declare dataset DBLP(DBLPType) primary key id on group1;

 // The query result is written to this file on rainbow-01.
 write output to rainbow-01:"/home/hyracks/dblp-self-join.adm";

 //
 // Self-join of DBLP on title similarity ('Jaccard', threshold .8).
 // Stage 1 builds a global ordering of title tokens, Stage 2 emits the
 // qualifying id pairs, and Stage 3 joins those pairs back to DBLP to
 // report dblpid and title for both sides.
 //
 // -- - Stage 3 - --
 //
 for $ridpair in
     //
     // -- - Stage 2 - --
     // Emits one {'idR', 'idS', 'sim'} record per pair with idR < idS.
     //
     for $paperR in dataset('DBLP')
     // Positions of this title's tokens in the Stage 1 ordering, sorted.
     let $tokensR :=
         for $word in counthashed-word-tokens($paperR.title)
         for $token at $i in
             //
             // -- - Stage 1 - --
             // Every title token in DBLP, ordered by count($paper) of
             // the tuples grouped under that token.
             //
             for $paper in dataset('DBLP')
             for $word in counthashed-word-tokens($paper.title)
             group by $item := $word with $paper
             order by count($paper)
             return $item
         where $word = $token
         order by $i
         return $i
     // One tuple per element of the prefix of $tokensR; presumably the
     // arguments are (collection, start offset, length) -- TODO confirm.
     for $prefix_tokenR in subset-collection(
                                 $tokensR,
                                 0,
                                 prefix-len(
                                     len($tokensR), 'Jaccard', .8))

     for $paperS in dataset('DBLP')
     // Same token-position list, computed for the other side of the join.
     let $tokensS :=
         for $word in counthashed-word-tokens($paperS.title)
         for $token at $i in
             //
             // -- - Stage 1 - --
             // Same global token ordering as used for $tokensR.
             //
             for $paper in dataset('DBLP')
             for $word in counthashed-word-tokens($paper.title)
             group by $item := $word with $paper
             order by count($paper)
             return $item
         where $word = $token
         order by $i
         return $i
     for $prefix_tokenS in subset-collection(
                                 $tokensS,
                                 0,
                                 prefix-len(
                                     len($tokensS), 'Jaccard', .8))

     // Candidate pairs are those sharing at least one prefix token.
     where $prefix_tokenR = $prefix_tokenS

     let $sim := similarity(
                     len(counthashed-word-tokens($paperR.title)),
                     $tokensR,
                     len(counthashed-word-tokens($paperS.title)),
                     $tokensS,
                     $prefix_tokenR,
                     'Jaccard',
                     .8)
     // The id comparison is applied here, before grouping, so self-pairs
     // and the mirrored (S, R) duplicates never reach the group-by. The
     // ids are the grouping keys, so the result is the same as filtering
     // the groups afterwards.
     where $sim >= .8 and $paperR.id < $paperS.id
     group by $idR := $paperR.id, $idS := $paperS.id with $sim
     // After grouping, $sim holds every similarity value collected for
     // the pair; the first one is reported.
     return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}

 // Look up both papers of each pair to report their dblpid and title.
 for $paperR in dataset('DBLP')
 for $paperS in dataset('DBLP')
 where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
 return {'R': {'dblpid': $paperR.dblpid, 'title': $paperR.title},
         'S': {'dblpid': $paperS.dblpid, 'title': $paperS.title},
         'sim': $ridpair.sim}
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	// Every statement below runs inside the fuzzy1 dataverse.
	use dataverse fuzzy1;

	// Record layout of one DBLP entry: declared as an open type with
	// five named fields.
	declare type DBLPType as open {
		id: int32,
		dblpid: string,
		title: string,
		authors: string,
		misc: string
	}

	// group1 names the five rainbow-0x nodes.
	declare nodegroup group1 on
		rainbow-01,
		rainbow-02,
		rainbow-03,
		rainbow-04,
		rainbow-05;

	// DBLP stores DBLPType records, keyed on id, on nodegroup group1.
	declare dataset DBLP(DBLPType) primary key id on group1;

	// The query result is written to this file on rainbow-01.
	write output to rainbow-01:"/home/hyracks/dblp-self-join.adm";

	//
	// Self-join of DBLP on title similarity ('Jaccard', threshold .8).
	// Stage 1 builds a global ordering of title tokens, Stage 2 emits the
	// qualifying id pairs, and Stage 3 joins those pairs back to DBLP to
	// report dblpid and title for both sides.
	//
	// -- - Stage 3 - --
	//
	for $ridpair in
		//
		// -- - Stage 2 - --
		// Emits one {'idR', 'idS', 'sim'} record per pair with idR < idS.
		//
		for $paperR in dataset('DBLP')
		// Positions of this title's tokens in the Stage 1 ordering, sorted.
		let $tokensR :=
			for $word in counthashed-word-tokens($paperR.title)
			for $token at $i in
				//
				// -- - Stage 1 - --
				// Every title token in DBLP, ordered by count($paper) of
				// the tuples grouped under that token.
				//
				for $paper in dataset('DBLP')
				for $word in counthashed-word-tokens($paper.title)
				group by $item := $word with $paper
				order by count($paper)
				return $item
			where $word = $token
			order by $i
			return $i
		// One tuple per element of the prefix of $tokensR; presumably the
		// arguments are (collection, start offset, length) -- TODO confirm.
		for $prefix_tokenR in subset-collection(
			$tokensR,
			0,
			prefix-len(
				len($tokensR), 'Jaccard', .8))

		for $paperS in dataset('DBLP')
		// Same token-position list, computed for the other side of the join.
		let $tokensS :=
			for $word in counthashed-word-tokens($paperS.title)
			for $token at $i in
				//
				// -- - Stage 1 - --
				// Same global token ordering as used for $tokensR.
				//
				for $paper in dataset('DBLP')
				for $word in counthashed-word-tokens($paper.title)
				group by $item := $word with $paper
				order by count($paper)
				return $item
			where $word = $token
			order by $i
			return $i
		for $prefix_tokenS in subset-collection(
			$tokensS,
			0,
			prefix-len(
				len($tokensS), 'Jaccard', .8))

		// Candidate pairs are those sharing at least one prefix token.
		where $prefix_tokenR = $prefix_tokenS

		let $sim := similarity(
			len(counthashed-word-tokens($paperR.title)),
			$tokensR,
			len(counthashed-word-tokens($paperS.title)),
			$tokensS,
			$prefix_tokenR,
			'Jaccard',
			.8)
		// The id comparison is applied here, before grouping, so self-pairs
		// and the mirrored (S, R) duplicates never reach the group-by. The
		// ids are the grouping keys, so the result is the same as filtering
		// the groups afterwards.
		where $sim >= .8 and $paperR.id < $paperS.id
		group by $idR := $paperR.id, $idS := $paperS.id with $sim
		// After grouping, $sim holds every similarity value collected for
		// the pair; the first one is reported.
		return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}

	// Look up both papers of each pair to report their dblpid and title.
	for $paperR in dataset('DBLP')
	for $paperS in dataset('DBLP')
	where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
	return {'R': {'dblpid': $paperR.dblpid, 'title': $paperR.title},
		'S': {'dblpid': $paperS.dblpid, 'title': $paperS.title},
		'sim': $ridpair.sim}