Blame - asterix-app/src/test/resources/fuzzyjoin/dblp/040-self-join-aql.aql - asterixdb

blob: f41892119e795be44a3d70987250d3dff1bfee20 [file] [log] [blame]

Ian Maxon	857dc13	2015-09-25 17:13:19 -0700	[diff] [blame^]	1	/*
				2	* Licensed to the Apache Software Foundation (ASF) under one
				3	* or more contributor license agreements. See the NOTICE file
				4	* distributed with this work for additional information
				5	* regarding copyright ownership. The ASF licenses this file
				6	* to you under the Apache License, Version 2.0 (the
				7	* "License"); you may not use this file except in compliance
				8	* with the License. You may obtain a copy of the License at
				9	*
				10	* http://www.apache.org/licenses/LICENSE-2.0
				11	*
				12	* Unless required by applicable law or agreed to in writing,
				13	* software distributed under the License is distributed on an
				14	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
				15	* KIND, either express or implied. See the License for the
				16	* specific language governing permissions and limitations
				17	* under the License.
				18	*/
vinayakb	38b7ca4	2012-03-05 05:44:15 +0000	[diff] [blame]	19	use dataverse fuzzy1;
				20	// DBLPType: open record type declaring an int32 id and four string fields (dblpid, title, authors, misc).
				21	declare type DBLPType as open {
				22	id: int32,
				23	dblpid: string,
				24	title: string,
				25	authors: string,
				26	misc: string
				27	}
				28	// group1 is declared on nodes nc1 and nc2; the DBLP dataset below is keyed by id and declared on group1.
				29	declare nodegroup group1 on nc1, nc2;
				30
				31	declare dataset DBLP(DBLPType)
ramangrover29	669d8f6	2013-02-11 06:03:32 +0000	[diff] [blame]	32	primary key id on group1;
vinayakb	38b7ca4	2012-03-05 05:44:15 +0000	[diff] [blame]	33	// The result of the query that follows is written to '/tmp/dblp.adm' on nc1.
				34	write output to nc1:'/tmp/dblp.adm';
				35
				36	//
				37	// -- - Stage 3 - --
				38	// Joins each id pair produced by Stage 2 back to DBLP (once per side) and returns the dblpid and title of both papers plus the pair's sim.
				39	for $ridpair in
				40	//
				41	// -- - Stage 2 - --
				42	// Self-joins DBLP on a shared prefix token and returns {idR, idS, sim} for the pairs that pass the similarity filter.
				43	for $paperR in dataset('DBLP')
				44	let $tokensR :=
				45	for $word in counthashed-word-tokens($paperR.title)
				46	for $token at $i in
				47	//
				48	// -- - Stage 1 - --
				49	// Lists each distinct title token of DBLP once, ordered by count($paper); Stage 2 uses a token's position ($i) in this list as its rank.
				50	for $paper in dataset('DBLP')
				51	for $word in counthashed-word-tokens($paper.title)
				52	group by $item := $word with $paper
				53	order by count($paper)
				54	return $item
				55	where $word = $token
				56	order by $i
				57	return $i
				58	for $prefix_tokenR in subset-collection(
				59	$tokensR,
				60	0,
				61	prefix-len(
				62	len($tokensR), 'Jaccard', .5))
				63	// S side of the self-join: same token ranking and prefix extraction as for $paperR above.
				64	for $paperS in dataset('DBLP')
				65	let $tokensS :=
				66	for $word in counthashed-word-tokens($paperS.title)
				67	for $token at $i in
				68	//
				69	// -- - Stage 1 - --
				70	// Same ordered token list as the Stage 1 subquery used for $tokensR, repeated here for the S side.
				71	for $paper in dataset('DBLP')
				72	for $word in counthashed-word-tokens($paper.title)
				73	group by $item := $word with $paper
				74	order by count($paper)
				75	return $item
				76	where $word = $token
				77	order by $i
				78	return $i
				79	for $prefix_tokenS in subset-collection(
				80	$tokensS,
				81	0,
				82	prefix-len(
				83	len($tokensS), 'Jaccard', .5))
				84
				85	where $prefix_tokenR = $prefix_tokenS
				86	// Score pairs sharing a prefix token; the filter keeps $sim >= .5 with idR < idS (drops self-pairs and mirrored duplicates). The group-by presumably collapses pairs matched on several prefix tokens; $sim[0] takes the first value.
				87	let $sim := similarity(
				88	len(counthashed-word-tokens($paperR.title)),
				89	$tokensR,
				90	len(counthashed-word-tokens($paperS.title)),
				91	$tokensS,
				92	$prefix_tokenR,
				93	'Jaccard',
				94	.5)
				95	where $sim >= .5 and $paperR.id < $paperS.id
				96	group by $idR := $paperR.id, $idS := $paperS.id with $sim
				97	return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}
				98
				99	for $paperR in dataset('DBLP')
				100	for $paperS in dataset('DBLP')
				101	where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
				102	return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
				103	'S': { 'dblpid': $paperS.dblpid, 'title': $paperS.title },
				104	'sim': $ridpair.sim }