// asterixdb/asterix-app/src/test/resources/optimizerts/queries/fj-dblp-csx.aql - asterixdb - Gitiles

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 // Start from a clean slate: drop any existing fj-dblp-csx dataverse,
 // recreate it, and make it the default dataverse for the statements below.
 drop dataverse fj-dblp-csx if exists;

 create dataverse fj-dblp-csx;

 use dataverse fj-dblp-csx;

 // Record type for DBLP entries: an int32 id plus four string fields.
 // The type is declared open (not closed).
 // Only id and title are read by the query in this file.
 create type DBLPType as open {
   id: int32,
   dblpid: string,
   title: string,
   authors: string,
   misc: string
 }

 // Record type for CSX entries. Same shape as DBLPType except that the
 // source-specific identifier field is named csxid.
 // Only id and title are read by the query in this file.
 create type CSXType as open {
   id: int32,
   csxid: string,
   title: string,
   authors: string,
   misc: string
 }

 // Node group made of the two nodes asterix_nc1 and asterix_nc2; created
 // only if it does not exist yet.
 create nodegroup group1 if not exists on asterix_nc1, asterix_nc2;

 // Both datasets use id as primary key and are placed on group1.
 create dataset DBLP(DBLPType) primary key id on group1;
 create dataset CSX(CSXType) primary key id on group1;

 // Direct the query result to this file on asterix_nc1.
 write output to asterix_nc1:'rttest/fj-dblp-csx.adm';

 // NOTE(review): presumably required so the query can call the internal
 // helper functions it uses (counthashed-word-tokens, prefix-len-jaccard,
 // subset-collection, similarity-jaccard-check) -- confirm.
 set import-private-functions 'true'

 // Fuzzy join of DBLP and CSX papers on their titles with a Jaccard
 // threshold of .8f. Outline:
 //   1. Tokenize each title with counthashed-word-tokens.
 //   2. Replace every token by its position ($i) in a global token order
 //      built from the DBLP titles (tokens grouped, then ordered by
 //      count($id) and by token value).
 //   3. Keep only a prefix of each ranked token list (subset-collection with
 //      a length derived from prefix-len-jaccard).
 //   4. Join both sides on equal prefix tokens, group by the id pair and keep
 //      the pairs that pass similarity-jaccard-check.
 // The hint comments (hash, inmem, bcast) are optimizer hints, not
 // documentation; keep each one directly in front of the clause or operand
 // it precedes.
 for $paperDBLP in dataset('DBLP')
 let $idDBLP := $paperDBLP.id
 let $unrankedTokensDBLP := counthashed-word-tokens($paperDBLP.title)
 // NOTE(review): $lenDBLP is not referenced again in this query.
 let $lenDBLP := len($unrankedTokensDBLP)
 // Rank positions of this paper's tokens, in ascending rank order.
 let $tokensDBLP :=
     for $token in $unrankedTokensDBLP
     for $tokenRanked at $i in
         //
         // -- - Stage 1 - --
         //
         // Global token order: every token of every DBLP title, grouped by
         // token and ordered by count($id), ties broken by the token itself.
         for $paper in dataset('DBLP')
         let $id := $paper.id
         for $token in counthashed-word-tokens($paper.title)
         /*+ hash */
         group by $tokenGroupped := $token with $id
         /*+ inmem 16 16384 */
         order by count($id), $tokenGroupped
         return $tokenGroupped
     // Match the paper's token against the ranked list to obtain its rank $i.
     where $token = /*+ bcast */ $tokenRanked
     order by $i
     return $i
 // One row per token in the prefix of the ranked token list.
 for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))

 // CSX side: same pipeline as above.
 for $paperCSX in dataset('CSX')
 let $idCSX := $paperCSX.id
 let $unrankedTokensCSX := counthashed-word-tokens($paperCSX.title)
 // NOTE(review): $lenCSX is not referenced again in this query.
 let $lenCSX := len($unrankedTokensCSX)
 let $tokensCSX :=
     for $token in $unrankedTokensCSX
     for $tokenRanked at $i in
         //
         // -- - Stage 1 - --
         //
         // The token order is again computed from the DBLP dataset, not from
         // CSX, so both sides share the same ranking.
         for $paper in dataset('DBLP')
         let $id := $paper.id
         for $token in counthashed-word-tokens($paper.title)
         /*+ hash */
         group by $tokenGroupped := $token with $id
         /*+ inmem 16 16384 */
         order by count($id), $tokenGroupped
         return $tokenGroupped
     where $token = /*+ bcast */ $tokenRanked
     order by $i
     return $i
 // Prefix length computed on the unranked token count, then reduced by the
 // number of tokens that did not survive the ranking join
 // (len($unrankedTokensCSX) - len($tokensCSX)).
 // NOTE(review): presumably this compensates for CSX tokens that do not
 // occur in any DBLP title -- confirm.
 let $actualPrefixLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f) - len($unrankedTokensCSX) + len($tokensCSX)
 for $prefixTokenCSX in subset-collection($tokensCSX, 0, $actualPrefixLen)

 // Candidate pairs: papers that share at least one prefix token.
 where $prefixTokenDBLP = $prefixTokenCSX

 // Collapse the candidate rows to one row per (idDBLP, idCSX) pair.
 /*+ hash */
 group by $idDBLP := $idDBLP, $idCSX := $idCSX with $unrankedTokensDBLP, $unrankedTokensCSX
 // NOTE(review): after the group-by the with-variables appear to hold one
 // entry per grouped row, so [0] picks one copy of the token list -- confirm.
 let $sim := similarity-jaccard-check($unrankedTokensDBLP[0], $unrankedTokensCSX[0], .8f)
 // NOTE(review): $sim[1] is presumably the similarity value returned by
 // similarity-jaccard-check -- confirm.
 where $sim[1] >= .8f
 return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	// Drop any existing fj-dblp-csx dataverse, recreate it, and select it.
	// NOTE(review): this repeats the setup statements found earlier in this
	// file; executed in sequence, the drop below removes the dataverse that
	// the earlier statements created.
	drop dataverse fj-dblp-csx if exists;

	create dataverse fj-dblp-csx;

	use dataverse fj-dblp-csx;

	// Record type for DBLP entries: an int32 id plus four string fields.
	// The type is declared open (not closed).
	// The field lines below are not indented relative to the braces.
	create type DBLPType as open {
	id: int32,
	dblpid: string,
	title: string,
	authors: string,
	misc: string
	}

	// Record type for CSX entries: an int32 id plus four string fields, with
	// csxid as the source-specific identifier. Declared open (not closed).
	// The field lines below are not indented relative to the braces.
	create type CSXType as open {
	id: int32,
	csxid: string,
	title: string,
	authors: string,
	misc: string
	}

	// Node group made of the two nodes asterix_nc1 and asterix_nc2; created
	// only if it does not exist yet.
	create nodegroup group1 if not exists on asterix_nc1, asterix_nc2;

	// Both datasets use id as primary key and are placed on group1.
	create dataset DBLP(DBLPType) primary key id on group1;
	create dataset CSX(CSXType) primary key id on group1;

	// Direct the query result to this file on asterix_nc1.
	write output to asterix_nc1:'rttest/fj-dblp-csx.adm';

	// NOTE(review): presumably required so the query can call the internal
	// helper functions it uses (counthashed-word-tokens, prefix-len-jaccard,
	// subset-collection, similarity-jaccard-check) -- confirm.
	set import-private-functions 'true'

	// Fuzzy join of DBLP and CSX papers on their titles with a Jaccard
	// threshold of .8f. Outline:
	//   1. Tokenize each title with counthashed-word-tokens.
	//   2. Replace every token by its position ($i) in a global token order
	//      built from the DBLP titles (tokens grouped, then ordered by
	//      count($id) and by token value).
	//   3. Keep only a prefix of each ranked token list (subset-collection
	//      with a length derived from prefix-len-jaccard).
	//   4. Join both sides on equal prefix tokens, group by the id pair and
	//      keep the pairs that pass similarity-jaccard-check.
	// The hint comments (hash, inmem, bcast) are optimizer hints, not
	// documentation; each one must keep its comment delimiters and stay
	// directly in front of the clause or operand it precedes.
	for $paperDBLP in dataset('DBLP')
	let $idDBLP := $paperDBLP.id
	let $unrankedTokensDBLP := counthashed-word-tokens($paperDBLP.title)
	// NOTE(review): $lenDBLP is not referenced again in this query.
	let $lenDBLP := len($unrankedTokensDBLP)
	// Rank positions of this paper's tokens, in ascending rank order.
	let $tokensDBLP :=
	    for $token in $unrankedTokensDBLP
	    for $tokenRanked at $i in
	        //
	        // -- - Stage 1 - --
	        //
	        // Global token order: every token of every DBLP title, grouped
	        // by token and ordered by count($id), ties broken by the token.
	        for $paper in dataset('DBLP')
	        let $id := $paper.id
	        for $token in counthashed-word-tokens($paper.title)
	        /*+ hash */
	        group by $tokenGroupped := $token with $id
	        /*+ inmem 16 16384 */
	        order by count($id), $tokenGroupped
	        return $tokenGroupped
	    // Match the paper's token against the ranked list to obtain rank $i.
	    where $token = /*+ bcast */ $tokenRanked
	    order by $i
	    return $i
	// One row per token in the prefix of the ranked token list.
	for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))

	// CSX side: same pipeline as above.
	for $paperCSX in dataset('CSX')
	let $idCSX := $paperCSX.id
	let $unrankedTokensCSX := counthashed-word-tokens($paperCSX.title)
	// NOTE(review): $lenCSX is not referenced again in this query.
	let $lenCSX := len($unrankedTokensCSX)
	let $tokensCSX :=
	    for $token in $unrankedTokensCSX
	    for $tokenRanked at $i in
	        //
	        // -- - Stage 1 - --
	        //
	        // The token order is again computed from the DBLP dataset, not
	        // from CSX, so both sides share the same ranking.
	        for $paper in dataset('DBLP')
	        let $id := $paper.id
	        for $token in counthashed-word-tokens($paper.title)
	        /*+ hash */
	        group by $tokenGroupped := $token with $id
	        /*+ inmem 16 16384 */
	        order by count($id), $tokenGroupped
	        return $tokenGroupped
	    where $token = /*+ bcast */ $tokenRanked
	    order by $i
	    return $i
	// Prefix length computed on the unranked token count, then reduced by the
	// number of tokens that did not survive the ranking join
	// (len($unrankedTokensCSX) - len($tokensCSX)).
	// NOTE(review): presumably this compensates for CSX tokens that do not
	// occur in any DBLP title -- confirm.
	let $actualPrefixLen := prefix-len-jaccard(len($unrankedTokensCSX), .8f) - len($unrankedTokensCSX) + len($tokensCSX)
	for $prefixTokenCSX in subset-collection($tokensCSX, 0, $actualPrefixLen)

	// Candidate pairs: papers that share at least one prefix token.
	where $prefixTokenDBLP = $prefixTokenCSX

	// Collapse the candidate rows to one row per (idDBLP, idCSX) pair.
	/*+ hash */
	group by $idDBLP := $idDBLP, $idCSX := $idCSX with $unrankedTokensDBLP, $unrankedTokensCSX
	// NOTE(review): after the group-by the with-variables appear to hold one
	// entry per grouped row, so [0] picks one copy of the token list -- confirm.
	let $sim := similarity-jaccard-check($unrankedTokensDBLP[0], $unrankedTokensCSX[0], .8f)
	// NOTE(review): $sim[1] is presumably the similarity value returned by
	// similarity-jaccard-check -- confirm.
	where $sim[1] >= .8f
	return {'idDBLP': $idDBLP, 'idCSX': $idCSX}