Blame - asterix-app/src/test/resources/optimizerts/queries/split-materialization-above-join.aql - asterixdb

blob: fb9e8353bfdce5bc5504da1aab306742bea4bff6 [file] [log] [blame]

icetindil	e8fd4cc	2014-08-05 20:25:16 -0700	[diff] [blame]	1	drop dataverse fuzzyjoin if exists;
				2	// The dataverse is dropped above (if present) and recreated below, so the script always starts from an empty 'fuzzyjoin' dataverse.
				3	create dataverse fuzzyjoin;
				4
				5	use dataverse fuzzyjoin;
				6	// Record type for DBLP entries: an int32 id plus four string fields. Declared 'open'.
				7	create type DBLPType as open {
				8	id: int32, // used as the primary key of dataset DBLP
				9	dblpid: string,
				10	title: string, // the only field tokenized by the similarity query below
				11	authors: string,
				12	misc: string
				13	}
				14	// Dataset holding DBLPType records, keyed on id. The query below reads it as both sides of a self-join.
				15	create dataset DBLP(DBLPType) primary key id;
				16	// NOTE(review): presumably required so the query can call prefix-len-jaccard / similarity-jaccard-prefix / subset-collection - confirm.
				17	set import-private-functions 'true';
				18
				19	// Three-stage similarity self-join of DBLP on title tokens; returns full record pairs with their similarity.
				20	// -- - Stage 3 - --
				21	// Joins each {idLeft, idRight, sim} produced by Stage 2 back to DBLP to fetch the full left and right records.
				22	for $paperLeft in dataset('DBLP')
				23	for $paperRight in dataset('DBLP')
				24	for $ridpair in
				25	//
				26	// -- - Stage 2 - --
				27	// Pairs papers that share a prefix token and keeps the id pairs whose similarity-jaccard-prefix result is >= .5f.
				28	for $paperLeft in dataset('DBLP')
				29	let $lenLeft := len(counthashed-word-tokens($paperLeft.title))
				30	let $tokensLeft :=
				31	for $tokenUnranked in counthashed-word-tokens($paperLeft.title)
				32	for $tokenRanked at $i in
				33	// Builds one global token order: every title token in DBLP, grouped, then ordered by count($paper) and by token.
				34	// -- - Stage 1 - --
				35	// The positional variable $i over this ordering is what the enclosing loop returns as the token's rank.
				36	for $paper in dataset('DBLP')
				37	for $token in counthashed-word-tokens($paper.title)
				38	group by $tokenGroupped := $token with $paper
				39	order by count($paper), $tokenGroupped
				40	return $tokenGroupped
				41	where $tokenUnranked = $tokenRanked
				42	order by $i
				43	return $i
				44	for $prefixTokenLeft in subset-collection(
				45	$tokensLeft,
				46	0,
				47	prefix-len-jaccard($lenLeft, .5f))
				48	// Right side: the same token ranking and prefix extraction as the left side above, applied to $paperRight.
				49	for $paperRight in dataset('DBLP')
				50	let $lenRight := len(counthashed-word-tokens($paperRight.title))
				51	let $tokensRight :=
				52	for $tokenUnranked in counthashed-word-tokens($paperRight.title)
				53	for $tokenRanked at $i in
				54	//
				55	// -- - Stage 1 - --
				56	// Same Stage 1 subquery as on the left side, repeated verbatim for the right input.
				57	for $paper in dataset('DBLP')
				58	for $token in counthashed-word-tokens($paper.title)
				59	group by $tokenGroupped := $token with $paper
				60	order by count($paper), $tokenGroupped
				61	return $tokenGroupped
				62	where $tokenUnranked = $tokenRanked
				63	order by $i
				64	return $i
				65	for $prefixTokenRight in subset-collection(
				66	$tokensRight,
				67	0,
				68	prefix-len-jaccard($lenRight, .5f))
				69	// Candidate generation: keep only left/right combinations that have a prefix token in common.
				70	where $prefixTokenLeft = $prefixTokenRight
				71	// Candidate verification: compute the similarity from both ranked token lists and the shared prefix token.
				72	let $sim := similarity-jaccard-prefix(
				73	$lenLeft,
				74	$tokensLeft,
				75	$lenRight,
				76	$tokensRight,
				77	$prefixTokenLeft,
				78	.5f)
				79	where $sim >= .5f and $paperLeft.id < $paperRight.id // id < id drops self-pairs and mirrored duplicates
				80	group by $idLeft := $paperLeft.id, $idRight := $paperRight.id with $sim // one output per id pair
				81	return {'idLeft': $idLeft, 'idRight': $idRight, 'sim': $sim[0]} // $sim is grouped; its first element is reported
				82	// Stage 3 join condition: match the Stage 2 id pair against the ids of the outer $paperLeft / $paperRight.
				83	where $ridpair.idLeft = $paperLeft.id and $ridpair.idRight = $paperRight.id
				84	order by $paperLeft.id, $paperRight.id
				85	return {'left': $paperLeft, 'right': $paperRight, 'sim': $ridpair.sim}