Blame - asterix-app/src/test/resources/demo0216/110-self-join-dblp.aql - asterixdb

blob: 7171e4320c1bce872945026743ecfd69d649284a [file] [log] [blame]

Ian Maxon	857dc13	2015-09-25 17:13:19 -0700	[diff] [blame]	1	/*
				2	* Licensed to the Apache Software Foundation (ASF) under one
				3	* or more contributor license agreements. See the NOTICE file
				4	* distributed with this work for additional information
				5	* regarding copyright ownership. The ASF licenses this file
				6	* to you under the Apache License, Version 2.0 (the
				7	* "License"); you may not use this file except in compliance
				8	* with the License. You may obtain a copy of the License at
				9	*
				10	* http://www.apache.org/licenses/LICENSE-2.0
				11	*
				12	* Unless required by applicable law or agreed to in writing,
				13	* software distributed under the License is distributed on an
				14	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
				15	* KIND, either express or implied. See the License for the
				16	* specific language governing permissions and limitations
				17	* under the License.
				18	*/
vinayakb	38b7ca4	2012-03-05 05:44:15 +0000	[diff] [blame]	19	use dataverse fuzzy1; // the statements below run against dataverse fuzzy1
				20
				21	declare type DBLPType as open { // record type of one DBLP entry (declared as an open type)
				22	id: int32, // primary key of the DBLP dataset declared below
				23	dblpid: string,
				24	title: string, // the only field the self-join query tokenizes
				25	authors: string,
				26	misc: string
				27	}
				28
				29	declare nodegroup group1 on rainbow-01, rainbow-02, rainbow-03,
				30	rainbow-04, rainbow-05; // group1 = the five rainbow-0x nodes; DBLP is declared on it
				31
				32	declare dataset DBLP(DBLPType)
ramangrover29	669d8f6	2013-02-11 06:03:32 +0000	[diff] [blame]	33	primary key id on group1;
vinayakb	38b7ca4	2012-03-05 05:44:15 +0000	[diff] [blame]	34
				35	write output to rainbow-01:"/home/hyracks/dblp-self-join.adm"; // query result goes to this file on node rainbow-01
				36
				37	// Fuzzy self-join of DBLP on title: pairs of papers whose similarity() result is >= .8.
				38	// -- - Stage 3 - --
				39	// Joins every {idR, idS, sim} record produced by Stage 2 back to DBLP to emit dblpid and title.
				40	for $ridpair in
				41	// For each paper, replaces its title tokens by their position in the Stage 1 list,
				42	// -- - Stage 2 - --
				43	// then joins R and S papers that share a prefix token and keeps pairs with $sim >= .8.
				44	for $paperR in dataset('DBLP')
				45	let $tokensR := // positions ($i) of this title's tokens in the Stage 1 list, ordered by $i
				46	for $word in counthashed-word-tokens($paperR.title)
				47	for $token at $i in
				48	//
				49	// -- - Stage 1 - --
				50	// Emits each title token of DBLP once (one group per token), ordered by count($paper).
				51	for $paper in dataset('DBLP')
				52	for $word in counthashed-word-tokens($paper.title)
				53	group by $item := $word with $paper
				54	order by count($paper)
				55	return $item
				56	where $word = $token
				57	order by $i
				58	return $i
				59	for $prefix_tokenR in subset-collection( // NOTE(review): presumably the leading prefix-len(...) entries of $tokensR - confirm argument meaning
				60	$tokensR,
				61	0,
				62	prefix-len(
				63	len($tokensR), 'Jaccard', .8))
				64
				65	for $paperS in dataset('DBLP') // S side of the self-join: same pipeline as for $paperR
				66	let $tokensS :=
				67	for $word in counthashed-word-tokens($paperS.title)
				68	for $token at $i in
				69	//
				70	// -- - Stage 1 - --
				71	// Same token-ordering subquery as the Stage 1 block used for $tokensR.
				72	for $paper in dataset('DBLP')
				73	for $word in counthashed-word-tokens($paper.title)
				74	group by $item := $word with $paper
				75	order by count($paper)
				76	return $item
				77	where $word = $token
				78	order by $i
				79	return $i
				80	for $prefix_tokenS in subset-collection(
				81	$tokensS,
				82	0,
				83	prefix-len(
				84	len($tokensS), 'Jaccard', .8))
				85
				86	where $prefix_tokenR = $prefix_tokenS // candidate pairs must share at least one prefix token
				87
				88	let $sim := similarity(
				89	len(counthashed-word-tokens($paperR.title)),
				90	$tokensR,
				91	len(counthashed-word-tokens($paperS.title)),
				92	$tokensS,
				93	$prefix_tokenR,
				94	'Jaccard',
				95	.8)
				96	where $sim >= .8 // and $paperR.id != $paperS.id
				97	group by $idR := $paperR.id, $idS := $paperS.id with $sim // one group per id pair, however many prefix tokens matched
				98	where $idR < $idS // strict '<' drops self pairs and the mirrored (idS, idR) copy of each pair
				99	return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]} // $sim is the grouped collection here; its first entry is reported
				100
				101	for $paperR in dataset('DBLP')
				102	for $paperS in dataset('DBLP')
				103	where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
				104	return {'R': {'dblpid': $paperR.dblpid, 'title': $paperR.title},
				105	'S': {'dblpid': $paperS.dblpid, 'title': $paperS.title},
				106	'sim': $ridpair.sim}