Blame - asterix-app/src/test/resources/fuzzyjoin/pub/070-join-aql.aql - asterixdb

blob: ec8867988d49f2fb45636b19109767e908eb9299 [file] [log] [blame]

vinayakb	38b7ca4	2012-03-05 05:44:15 +0000	[diff] [blame]	1	use dataverse fuzzy1;
				2
				3	declare type DBLPType as open { // record type of the DBLP dataset declared below
				4	id: int32, // used as the partitioning key of the DBLP dataset
				5	dblpid: string, // returned by the query as 'dblpid'
				6	title: string, // the only field the query tokenizes and compares
				7	authors: string, // not read by the query in this file
				8	misc: string // not read by the query in this file
				9	}
				10
				11	declare type CSXType as open { // record type of the CSX dataset declared below
				12	id: int32, // used as the partitioning key of the CSX dataset
				13	csxid: string, // returned by the query as 'csxid'
				14	title: string, // the only field the query tokenizes and compares
				15	authors: string, // not read by the query in this file
				16	misc: string // not read by the query in this file
				17	}
				18
				19	declare nodegroup group1 on nc1, nc2; // node group made of nc1 and nc2; both datasets below are placed on it
				20
				21	declare dataset DBLP(DBLPType) // dataset holding DBLPType records
				22	partitioned by key id on group1; // partitioned on the id field across group1
				23
				24	declare dataset CSX(CSXType) // dataset holding CSXType records
				25	partitioned by key id on group1; // partitioned on the id field across group1
				26
				27	write output to nc1:'/tmp/pub.adm'; // the query result is written to /tmp/pub.adm on nc1
				28
				29	// Three-stage similarity join of DBLP and CSX paper titles, written as nested FLWOR expressions.
				30	// -- - Stage 3 - -- joins every id pair produced by Stage 2 back to DBLP and CSX
				31	// and returns dblpid/csxid, both titles and the pair's 'sim' value.
				32	for $ridpair in // $ridpair ranges over the {'idR', 'idS', 'sim'} records returned by Stage 2
				33	//
				34	// -- - Stage 2 - -- pairs DBLP and CSX papers that share a prefix token and keeps pairs with $sim >= .5
				35	// It returns one {'idR', 'idS', 'sim'} record per (DBLP id, CSX id) group.
				36	for $paperR in dataset('DBLP') // R side of the join
				37	let $lenR := len(counthashed-word-tokens($paperR.title)) // length of the title's token list
				38	let $tokensR := // positions, in the Stage 1 list, of this title's tokens, sorted by position
				39	for $word in counthashed-word-tokens($paperR.title)
				40	for $token at $i in // $i is the position of $token in the list returned by Stage 1
				41	//
				42	// -- - Stage 1 - -- groups the DBLP title tokens and returns each distinct token, ordered by count($paper)
				43	// A token's position $i in this list stands in for the token itself in Stage 2.
				44	for $paper in dataset('DBLP')
				45	for $word in counthashed-word-tokens($paper.title)
				46	group by $item := $word with $paper // one group per distinct token, carrying its $paper bindings
				47	order by count($paper) // NOTE(review): no direction given; presumably ascending (rarest token first) - confirm
				48	return $item
				49	where $word = $token // keep only the Stage 1 entry that equals this title token
				50	order by $i
				51	return $i
				52	for $prefix_tokenR in subset-collection( // iterate over a slice of $tokensR that starts at index 0
				53	$tokensR,
				54	0,
				55	prefix-len($lenR, 'Jaccard', .5)) // NOTE(review): presumably the prefix length needed for Jaccard at threshold .5 - confirm against prefix-len
				56
				57	for $paperS in dataset('CSX') // S side of the join
				58	let $lenS := len(counthashed-word-tokens($paperS.title)) // length of the title's token list
				59	let $tokensS := // positions, in the Stage 1 list, of this title's tokens, sorted by position
				60	for $word in counthashed-word-tokens($paperS.title)
				61	for $token at $i in // $i is the position of $token in the list returned by Stage 1
				62	//
				63	// -- - Stage 1 - -- same expression as on the R side: it reads DBLP, not CSX,
				64	// so $tokensS holds positions in the same token list that $tokensR uses.
				65	for $paper in dataset('DBLP')
				66	for $word in counthashed-word-tokens($paper.title)
				67	group by $item := $word with $paper // one group per distinct token, carrying its $paper bindings
				68	order by count($paper) // NOTE(review): no direction given; presumably ascending (rarest token first) - confirm
				69	return $item
				70	where $word = $token // keep only the Stage 1 entry that equals this title token
				71	order by $i
				72	return $i
				73	for $prefix_tokenS in subset-collection( // iterate over a slice of $tokensS that starts at index 0
				74	$tokensS,
				75	0,
				76	prefix-len($lenS, 'Jaccard', .5)) // NOTE(review): presumably the prefix length needed for Jaccard at threshold .5 - confirm against prefix-len
				77
				78	where $prefix_tokenR = $prefix_tokenS // join condition: the two papers have a prefix token in common
				79
				80	let $sim := similarity( // NOTE(review): similarity is a builtin not defined in this file; semantics of its result are assumed, not shown
				81	$lenR,
				82	$tokensR,
				83	$lenS,
				84	$tokensS,
				85	$prefix_tokenR,
				86	'Jaccard',
				87	.5)
				88	where $sim >= .5 // discard pairs whose similarity value is below .5
				89	group by $idR := $paperR.id, $idS := $paperS.id with $sim // one group per id pair; a pair can match on more than one prefix token
				90	return {'idR': $idR, 'idS': $idS, 'sim': $sim[0]} // $sim is a list after grouping; the first entry is reported
				91
				92	for $paperR in dataset('DBLP') // Stage 3 continues here: look up the full records for each id pair
				93	for $paperS in dataset('CSX')
				94	where $ridpair.idR = $paperR.id and $ridpair.idS = $paperS.id // equality join on both ids
				95	return { 'R': { 'dblpid': $paperR.dblpid, 'title': $paperR.title },
				96	'S': { 'csxid': $paperS.csxid, 'title': $paperS.title },
				97	'sim': $ridpair.sim }