Blame - asterix-app/src/test/resources/fuzzyjoin/dblp/040-self-join-aql.aql - asterixdb

blob: 63e1a1efdbb787e669af5c724802de4a0392cada [file] [log] [blame]

vinayakb	38b7ca4	2012-03-05 05:44:15 +0000	[diff] [blame^]	1	use dataverse fuzzy1;
				2	// Open record type used by the DBLP dataset declared further down.
				3	declare type DBLPType as open {
				4	id: int32,
				5	dblpid: string,
				6	title: string,
				7	authors: string,
				8	misc: string
				9	}
				10	// Node group made of nc1 and nc2; the dataset below is placed on it.
				11	declare nodegroup group1 on nc1, nc2;
				12	// DBLP stores DBLPType records, partitioned by the id field on group1.
				13	declare dataset DBLP(DBLPType)
				14	partitioned by key id on group1;
				15	// The query result is written to /tmp/dblp.adm on node nc1.
				16	write output to nc1:'/tmp/dblp.adm';
				17
				18	//
				19	// -- - Stage 3 - --
				20	// Join every id pair produced by Stage 2 back to DBLP and return both papers' dblpid and title together with the pair's similarity.
				21	for $ridpair in
				22	//
				23	// -- - Stage 2 - --
				24	// Self-join DBLP on shared prefix tokens, score each candidate with similarity(), and emit one {idR, idS, sim} record per id pair.
				25	for $paperR in dataset('DBLP')
				26	let $tokensR := // positions ($i) in the Stage 1 token list of this title's tokens, ordered by $i
				27	for $word in counthashed-word-tokens($paperR.title)
				28	for $token at $i in
				29	//
				30	// -- - Stage 1 - --
				31	// Distinct title tokens over all of DBLP, ordered by count($paper) of each token's group.
				32	for $paper in dataset('DBLP')
				33	for $word in counthashed-word-tokens($paper.title)
				34	group by $item := $word with $paper
				35	order by count($paper)
				36	return $item
				37	where $word = $token
				38	order by $i
				39	return $i
				40	for $prefix_tokenR in subset-collection(
				41	$tokensR,
				42	0,
				43	prefix-len(
				44	len($tokensR), 'Jaccard', .5)) // NOTE(review): presumably takes the leading prefix-len entries of $tokensR for Jaccard threshold .5 - confirm against the function docs
				45	// Same token ranking and prefix extraction, now for the S side of the self-join.
				46	for $paperS in dataset('DBLP')
				47	let $tokensS := // positions ($i) in the Stage 1 token list of this title's tokens, ordered by $i
				48	for $word in counthashed-word-tokens($paperS.title)
				49	for $token at $i in
				50	//
				51	// -- - Stage 1 - --
				52	// Identical to the Stage 1 subquery used for $tokensR.
				53	for $paper in dataset('DBLP')
				54	for $word in counthashed-word-tokens($paper.title)
				55	group by $item := $word with $paper
				56	order by count($paper)
				57	return $item
				58	where $word = $token
				59	order by $i
				60	return $i
				61	for $prefix_tokenS in subset-collection(
				62	$tokensS,
				63	0,
				64	prefix-len(
				65	len($tokensS), 'Jaccard', .5))
				66	// Candidate pairs: the R and S prefixes have a token in common.
				67	where $prefix_tokenR = $prefix_tokenS
				68	// Score the candidate pair; the 'Jaccard' / .5 arguments match the ones passed to prefix-len above.
				69	let $sim := similarity(
				70	len(counthashed-word-tokens($paperR.title)),
				71	$tokensR,
				72	len(counthashed-word-tokens($paperS.title)),
				73	$tokensS,
				74	$prefix_tokenR,
				75	'Jaccard',
				76	.5)
				77	where $sim >= .5 and $paperR.id < $paperS.id // strict < drops self-pairs and the mirrored (S, R) duplicate of each pair
				78	group by $idR := $paperR.id, $idS := $paperS.id with $sim // a pair can match on several prefix tokens; collapse those rows into one group
				79	return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]}
				80	// Stage 3 lookups: fetch the R and S records whose ids match the pair.
				81	for $paperR in dataset('DBLP')
				82	for $paperS in dataset('DBLP')
				83	where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id
				84	return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
				85	'S': { 'dblpid': $paperS.dblpid, 'title': $paperS.title },
				86	'sim': $ridpair.sim }