Added a simple inverted index based on a BTree; conjunctive queries are supported.

git-svn-id: https://hyracks.googlecode.com/svn/trunk/hyracks@205 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
new file mode 100644
index 0000000..5df3bb5
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/pom.xml
@@ -0,0 +1,70 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>edu.uci.ics.hyracks</groupId>
  <artifactId>hyracks-storage-am-invertedindex</artifactId>
  <version>0.1.3-SNAPSHOT</version>

  <parent>
    <groupId>edu.uci.ics.hyracks</groupId>
    <artifactId>hyracks</artifactId>
    <version>0.1.3-SNAPSHOT</version>
  </parent>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.0.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>edu.uci.ics.hyracks</groupId>
      <artifactId>hyracks-storage-common</artifactId>
      <version>0.1.3-SNAPSHOT</version>
      <type>jar</type>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>edu.uci.ics.hyracks</groupId>
      <artifactId>hyracks-dataflow-common</artifactId>
      <version>0.1.3-SNAPSHOT</version>
      <type>jar</type>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>edu.uci.ics.hyracks</groupId>
      <artifactId>hyracks-dataflow-std</artifactId>
      <version>0.1.3-SNAPSHOT</version>
      <type>jar</type>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>edu.uci.ics.hyracks</groupId>
      <artifactId>hyracks-control-nc</artifactId>
      <version>0.1.3-SNAPSHOT</version>
      <type>jar</type>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>edu.uci.ics.hyracks</groupId>
      <artifactId>hyracks-storage-am-btree</artifactId>
      <version>0.1.3-SNAPSHOT</version>
      <type>jar</type>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.8.1</version>
      <type>jar</type>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
new file mode 100644
index 0000000..b01025b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
@@ -0,0 +1,20 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;

import java.io.DataOutput;
import java.io.IOException;

import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;

/**
 * Iterator-style tokenizer over a single binary-encoded field.
 * Usage pattern (see BinaryTokenizerOperatorNodePushable): call
 * {@link #reset(byte[], int, int)} with the field's bytes, then repeatedly
 * {@link #hasNext()} / {@link #next()}, and serialize the current token
 * with {@link #writeToken(DataOutput)}.
 */
public interface IBinaryTokenizer {
	
	/** Positions the tokenizer at the beginning of the given byte range. */
	public void reset(byte[] data, int start, int length);	
	/** Returns true if another token can be produced by {@link #next()}. */
	public boolean hasNext();	
	/** Advances to the next token, making it the current token. */
	public void next();
	
	// Offset/length of the current token within the data passed to reset().
	// Implementations that derive tokens instead of slicing them (e.g. hashed
	// q-grams) may not support these — see HashedQGramUTF8StringBinaryTokenizer.
	public int getTokenStartOff();
	public int getTokenLength();
	
	/** Serializes the current token to the given output. */
	public void writeToken(DataOutput dos) throws IOException;
	
	/** Record descriptor describing the (single-field) serialized token. */
	public RecordDescriptor getTokenSchema();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..e1072ae
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
@@ -0,0 +1,7 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;

import java.io.Serializable;

/**
 * Serializable factory for {@link IBinaryTokenizer} instances; tokenizers
 * themselves are stateful, so operators create one per runtime instance.
 */
public interface IBinaryTokenizerFactory extends Serializable {
	/** Creates a fresh tokenizer instance. */
	public IBinaryTokenizer createBinaryTokenizer();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
new file mode 100644
index 0000000..378c925a
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
@@ -0,0 +1,10 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;

import java.nio.ByteBuffer;

/**
 * Cursor over the result frames produced by an inverted-index search.
 * Iteration pattern: hasNext() / next() / getBuffer(); reset() rewinds
 * to before the first buffer.
 */
public interface IInvertedIndexResultCursor {		
	/** Returns true if another result buffer is available via next(). */
	public boolean hasNext();
	/** Advances to the next result buffer. */
	public void next();	
	/** Returns the current result buffer. */
	public ByteBuffer getBuffer();
	/** Rewinds the cursor to before the first result buffer. */
	public void reset();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
new file mode 100644
index 0000000..fc2dfd1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
@@ -0,0 +1,9 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;

import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;

/**
 * Executes a search over an inverted index. The query string is taken from
 * one field of the given tuple; results are exposed through
 * {@link IInvertedIndexResultCursor}.
 */
public interface IInvertedIndexSearcher {			
	/**
	 * Runs the search using field {@code queryFieldIndex} of
	 * {@code queryTuple} as the query.
	 */
	public void search(ITupleReference queryTuple, int queryFieldIndex) throws HyracksDataException;
	/** Returns a cursor over the results of the most recent search(). */
	public IInvertedIndexResultCursor getResultCursor();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
new file mode 100644
index 0000000..0d5dbc4
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -0,0 +1,42 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;

import edu.uci.ics.hyracks.api.context.IHyracksContext;
import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
import edu.uci.ics.hyracks.api.job.JobSpecification;
import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;

/**
 * Single-activity operator that tokenizes selected fields of each input tuple
 * and emits one output tuple per token, consisting of the token followed by
 * the projected fields. Runtime logic lives in
 * {@link BinaryTokenizerOperatorNodePushable}.
 */
public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
	
	private static final long serialVersionUID = 1L;
	
	private final IBinaryTokenizerFactory tokenizerFactory;
	// fields that will be tokenized
	private final int[] tokenFields;
	// operator will emit these projected fields for each token, e.g., as payload for an inverted list
	// WARNING: too many projected fields can cause significant data blowup
	private final int[] projFields;
	
	/**
	 * @param spec             owning job specification
	 * @param recDesc          output record descriptor (token + projected fields)
	 * @param tokenizerFactory creates the per-partition tokenizer
	 * @param tokenFields      input field indexes to tokenize
	 * @param projFields       input field indexes carried along with each token
	 */
	public BinaryTokenizerOperatorDescriptor(JobSpecification spec, RecordDescriptor recDesc, IBinaryTokenizerFactory tokenizerFactory, int[] tokenFields, int[] projFields) {
		super(spec, 1, 1);
		this.tokenizerFactory = tokenizerFactory;
		this.tokenFields = tokenFields;
		this.projFields = projFields;
		recordDescriptors[0] = recDesc;
	}
	
	@Override
	public IOperatorNodePushable createPushRuntime(IHyracksContext ctx,
			IOperatorEnvironment env,
			IRecordDescriptorProvider recordDescProvider, int partition,
			int nPartitions) throws HyracksDataException {
		// a fresh tokenizer per runtime instance: tokenizers are stateful
		return new BinaryTokenizerOperatorNodePushable(ctx, 
				recordDescProvider.getInputRecordDescriptor(odId, 0), 
				recordDescriptors[0], tokenizerFactory.createBinaryTokenizer(), 
				tokenFields, projFields);
	}
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
new file mode 100644
index 0000000..869b0be
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -0,0 +1,103 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.context.IHyracksContext;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+
+public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
+
+	private final IHyracksContext ctx;
+	private final IBinaryTokenizer tokenizer;
+	private final int[] tokenFields;
+	private final int[] projFields;
+	private final RecordDescriptor inputRecDesc;
+	private final RecordDescriptor outputRecDesc;
+	
+	private FrameTupleAccessor accessor;
+	private ArrayTupleBuilder builder;
+	private DataOutput builderDos;
+	private FrameTupleAppender appender;
+	private ByteBuffer writeBuffer;
+	
+	public BinaryTokenizerOperatorNodePushable(IHyracksContext ctx, RecordDescriptor inputRecDesc, RecordDescriptor outputRecDesc, IBinaryTokenizer tokenizer, int[] tokenFields, int[] projFields) {
+		this.ctx = ctx;
+		this.tokenizer = tokenizer;
+		this.tokenFields = tokenFields;
+		this.projFields = projFields;
+		this.inputRecDesc = inputRecDesc;
+		this.outputRecDesc = outputRecDesc;
+	}
+	
+	@Override
+	public void open() throws HyracksDataException {
+		accessor = new FrameTupleAccessor(ctx, inputRecDesc);
+        writeBuffer = ctx.getResourceManager().allocateFrame();
+        builder = new ArrayTupleBuilder(outputRecDesc.getFields().length);
+        builderDos = builder.getDataOutput();
+        appender = new FrameTupleAppender(ctx);
+        appender.reset(writeBuffer, true);
+	}
+	
+	@Override
+	public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
+		accessor.reset(buffer);				
+		
+        int tupleCount = accessor.getTupleCount();
+        for (int i = 0; i < tupleCount; i++) {
+        	
+        	for(int j = 0; j < tokenFields.length; j++) {
+        		
+        		tokenizer.reset(accessor.getBuffer().array(), 
+        				accessor.getTupleStartOffset(i) + accessor.getFieldSlotsLength() + accessor.getFieldStartOffset(i, tokenFields[j]), 
+        				accessor.getFieldLength(i, tokenFields[j]));
+        		
+        		while(tokenizer.hasNext()) {        			
+        			tokenizer.next();
+        	
+        			builder.reset();
+        			try {
+						tokenizer.writeToken(builderDos);
+						builder.addFieldEndOffset();
+					} catch (IOException e) {
+						throw new HyracksDataException(e.getMessage());
+					}
+        			
+					for(int k = 0; k < projFields.length; k++) {
+						builder.addField(accessor, i, projFields[k]);
+					}
+					
+					if (!appender.append(builder.getFieldEndOffsets(), builder.getByteArray(), 0, builder.getSize())) {
+		    			FrameUtils.flushFrame(writeBuffer, writer);
+		    			appender.reset(writeBuffer, true);
+		    			if (!appender.append(builder.getFieldEndOffsets(), builder.getByteArray(), 0, builder.getSize())) {
+		    				throw new IllegalStateException();
+		    			}
+		    		}
+        		}
+        	}         	        	
+        }
+        
+        if (appender.getTupleCount() > 0) {
+			FrameUtils.flushFrame(writeBuffer, writer);
+		}
+	}
+	
+	@Override
+	public void close() throws HyracksDataException {
+		writer.close();		
+	}
+	
+	@Override
+	public void flush() throws HyracksDataException {
+	}	
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
new file mode 100644
index 0000000..44bf9f5
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
@@ -0,0 +1,40 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+
+public class ListResultCursor implements IInvertedIndexResultCursor {
+
+	private List<ByteBuffer> resultBuffers;
+	private int numResultBuffers;
+	private int currentPos = -1;
+	
+	public void setResults(List<ByteBuffer> resultBuffers, int numResultBuffers) {
+		this.resultBuffers = resultBuffers;
+		this.numResultBuffers = numResultBuffers;
+		reset();
+	}	
+		
+	@Override
+	public boolean hasNext() {
+		if(currentPos < numResultBuffers) return true;
+		else return false;		
+	}
+
+	@Override
+	public void next() {		
+		currentPos++;
+	}
+	
+	@Override
+	public ByteBuffer getBuffer() {
+		return resultBuffers.get(currentPos);
+	}
+	
+	@Override
+	public void reset() {
+		currentPos = -1;		
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
new file mode 100644
index 0000000..ec45deb
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
@@ -0,0 +1,265 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.impls;

import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import edu.uci.ics.hyracks.api.context.IHyracksContext;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeCursor;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
import edu.uci.ics.hyracks.storage.am.btree.impls.MultiComparator;
import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;

/**
 * Conjunctive (AND-semantics) searcher over an inverted index stored in a
 * BTree. BTree tuples are laid out as (key fields..., value fields...): the
 * first numKeyFields fields form the token, the remaining numValueFields form
 * the inverted-list entry (e.g. a document id).
 *
 * Search strategy: the query is tokenized, the inverted list of the first
 * token is materialized into result frames, and each remaining token's list
 * is intersected against the running result via an ordered merge
 * (intersectList). Results ping-pong between newResultBuffers and
 * prevResultBuffers.
 *
 * NOTE(review): the merge relies on both the BTree range scan and the stored
 * result frames being sorted on the value fields — presumably guaranteed by
 * the BTree sort order; confirm.
 */
public class SimpleConjunctiveSearcher implements IInvertedIndexSearcher {
		
	private final int numKeyFields;
	private final int numValueFields;
	
	// comparators split out of the BTree's MultiComparator:
	// keyCmps compare tokens, valueCmps compare inverted-list entries
	private final IBinaryComparator[] keyCmps;
	private final IBinaryComparator[] valueCmps;
	
	private final BTree btree;
	private final IHyracksContext ctx;
	private final ArrayTupleBuilder resultTupleBuilder;
	private final FrameTupleAppender resultTupleAppender;
	private final FrameTupleAccessor resultFrameAccessor;
	
	// double-buffered result frame lists; swapped before each intersection
	private List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
	private List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
	private List<ByteBuffer> swap = null;
	private final ListResultCursor resultCursor = new ListResultCursor();
	// index of the last result frame in newResultBuffers that holds data
	private int maxResultBufIdx = 0;
		
	private final IBTreeLeafFrame leafFrame;
	private final IBTreeInteriorFrame interiorFrame;
	private final IBTreeCursor btreeCursor;
	private final FrameTupleReference searchKey = new FrameTupleReference();
	// low key == high key == current token: an exact-match range scan
	private final RangePredicate pred = new RangePredicate(true, null, null, null);
	
	private final IBinaryTokenizer queryTokenizer;
	
	/**
	 * @param ctx            hyracks context used for frame allocation
	 * @param btree          BTree holding the inverted index
	 * @param btreeRecDesc   record descriptor of the BTree tuples
	 * @param queryTokenizer tokenizer applied to the query field
	 * @param numKeyFields   number of leading token (key) fields
	 * @param numValueFields number of trailing inverted-list (value) fields
	 */
	public SimpleConjunctiveSearcher(IHyracksContext ctx, BTree btree, RecordDescriptor btreeRecDesc, IBinaryTokenizer queryTokenizer, int numKeyFields, int numValueFields) {
		this.ctx = ctx;
		this.btree = btree;
		this.queryTokenizer = queryTokenizer;
		this.numKeyFields = numKeyFields;
		this.numValueFields = numValueFields;
		
		leafFrame = btree.getLeafFrameFactory().getFrame();
		interiorFrame = btree.getInteriorFrameFactory().getFrame();
		btreeCursor = new RangeSearchCursor(leafFrame);
		resultTupleAppender = new FrameTupleAppender(ctx);
		resultTupleBuilder = new ArrayTupleBuilder(numValueFields);
		newResultBuffers.add(ctx.getResourceManager().allocateFrame());
		prevResultBuffers.add(ctx.getResourceManager().allocateFrame());

		MultiComparator btreeCmp = btree.getMultiComparator();
				
		keyCmps = new IBinaryComparator[numKeyFields];
		for(int i = 0; i < numKeyFields; i++) {
			keyCmps[i] = btreeCmp.getComparators()[i];
		}
				
		valueCmps = new IBinaryComparator[numValueFields];
		for(int i = 0; i < numValueFields; i++) {
			valueCmps[i] = btreeCmp.getComparators()[numKeyFields + i];
		}
		
		// search predicate compares only the key (token) fields
		MultiComparator searchCmp = new MultiComparator(btreeCmp.getTypeTraits(), keyCmps);
		pred.setComparator(searchCmp);
		pred.setLowKey(searchKey);
		pred.setHighKey(searchKey);
				
		// result frames contain only the value fields of matching tuples
		ISerializerDeserializer[] valueSerde = new ISerializerDeserializer[numValueFields];
		for(int i = 0; i < numValueFields; i++) {
			valueSerde[i] = btreeRecDesc.getFields()[numKeyFields + i];
			
		}
		RecordDescriptor valueRecDesc = new RecordDescriptor(valueSerde);
		resultFrameAccessor = new FrameTupleAccessor(ctx, valueRecDesc);
	}
	
	/**
	 * Runs the conjunctive search. Tokenizes field queryFieldIndex of
	 * queryTuple, then intersects the inverted lists of all query tokens.
	 * Results are available via {@link #getResultCursor()}.
	 */
	public void search(ITupleReference queryTuple, int queryFieldIndex) throws HyracksDataException {
		
		// parse query, TODO: this parsing is too simple
		RecordDescriptor queryTokenRecDesc = new RecordDescriptor(new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE } );
		
		ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
		DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
		FrameTupleAppender queryTokenAppender = new FrameTupleAppender(ctx);
		ByteBuffer queryTokenFrame = ctx.getResourceManager().allocateFrame();
		queryTokenAppender.reset(queryTokenFrame, true);
		
		// materialize all query tokens into a single frame
		queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex), queryTuple.getFieldLength(queryFieldIndex));
		while(queryTokenizer.hasNext()) {
			queryTokenizer.next();
			
			queryTokenBuilder.reset();
			try {
				queryTokenizer.writeToken(queryTokenDos);
				queryTokenBuilder.addFieldEndOffset();
			} catch (IOException e) {
				throw new HyracksDataException(e);
			}
			
			// WARNING: assuming one frame is enough to hold all tokens
			queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0, queryTokenBuilder.getSize());
		}
		
		FrameTupleAccessor queryTokenAccessor = new FrameTupleAccessor(ctx, queryTokenRecDesc);
		queryTokenAccessor.reset(queryTokenFrame);
		int numQueryTokens = queryTokenAccessor.getTupleCount();
		
		maxResultBufIdx = 0;
				
		// NOTE(review): if the query yields zero tokens, searchKey.reset(...)
		// below references tuple 0 of an empty frame — verify this is handled
		// upstream or cannot occur.
		resultTupleAppender.reset(newResultBuffers.get(0), true);
		try {
			// append first inverted list to temporary results
			searchKey.reset(queryTokenAccessor, 0);
			btree.search(btreeCursor, pred, leafFrame, interiorFrame);
			while(btreeCursor.hasNext()) {
				btreeCursor.next();
				maxResultBufIdx = appendTupleToNewResults(btreeCursor, maxResultBufIdx);
			}
			btreeCursor.reset();
		} catch (Exception e) {
			throw new HyracksDataException(e);
		}
		
		resultFrameAccessor.reset(newResultBuffers.get(0));
		
		// intersect temporary results with remaining inverted lists
		for(int i = 1; i < numQueryTokens; i++) {
			// swap roles: last round's results become the "previous" side
			swap = prevResultBuffers;
			prevResultBuffers = newResultBuffers;
			newResultBuffers = swap;
			try {
				searchKey.reset(queryTokenAccessor, i);
				btree.search(btreeCursor, pred, leafFrame, interiorFrame);
				maxResultBufIdx = intersectList(btreeCursor, prevResultBuffers, maxResultBufIdx, newResultBuffers);
			} catch (Exception e) {
				throw new HyracksDataException(e);
			}
			btreeCursor.reset();
		}
	}
		
	/**
	 * Appends the value fields of the cursor's current tuple to the
	 * newResultBuffers frame at newBufIdx, growing the list with a fresh
	 * frame when the current one is full.
	 *
	 * @return the (possibly advanced) index of the frame written to
	 */
	private int appendTupleToNewResults(IBTreeCursor btreeCursor, int newBufIdx) throws IOException {
		ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
		
		ITupleReference tuple = btreeCursor.getTuple();
		resultTupleBuilder.reset();
		DataOutput dos = resultTupleBuilder.getDataOutput();
		for(int i = 0; i < numValueFields; i++) {
			int fIdx = numKeyFields + i;
			dos.write(tuple.getFieldData(fIdx), tuple.getFieldStart(fIdx), tuple.getFieldLength(fIdx));
			resultTupleBuilder.addFieldEndOffset();
		}
		
		if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0, resultTupleBuilder.getSize())) {
			// frame full: move to the next frame, allocating it if needed
			newBufIdx++;
			if(newBufIdx >= newResultBuffers.size()) {
				newResultBuffers.add(ctx.getResourceManager().allocateFrame());
			}
			newCurrentBuffer = newResultBuffers.get(newBufIdx);
			resultTupleAppender.reset(newCurrentBuffer, true);
			if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0, resultTupleBuilder.getSize())) {
				// a single result tuple larger than a frame cannot be stored
				throw new IllegalStateException();
			}
		}
		
		return newBufIdx;
	}
	
	/**
	 * Ordered merge-intersection of the BTree cursor's inverted list with the
	 * previous results; matches (equal value fields) are appended to
	 * newResultBuffers.
	 *
	 * @return index of the last frame written in newResultBuffers
	 */
	private int intersectList(IBTreeCursor btreeCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws IOException, Exception {
		
		int newBufIdx = 0;
		ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
		
		int prevBufIdx = 0;
		ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
				
		resultTupleBuilder.reset();
		resultTupleAppender.reset(newCurrentBuffer, true);
		resultFrameAccessor.reset(prevCurrentBuffer);
		
		// WARNING: not very efficient but good enough for the first cut
		boolean advanceCursor = true;
		boolean advancePrevResult = false;
		int resultTidx = 0;
		
		// classic sorted-merge: advance whichever side compares smaller,
		// emit on equality, stop when either side is exhausted
		while( (!advanceCursor || btreeCursor.hasNext()) && prevBufIdx <= maxPrevBufIdx && resultTidx < resultFrameAccessor.getTupleCount()) {
			
			if(advanceCursor) btreeCursor.next();
			ITupleReference tuple = btreeCursor.getTuple();
			
			// lexicographic comparison of the value fields
			int cmp = 0;
			for(int i = 0; i < valueCmps.length; i++) {
				int tupleFidx = numKeyFields + i;
				cmp = valueCmps[i].compare(tuple.getFieldData(tupleFidx), 
						tuple.getFieldStart(tupleFidx), 
						tuple.getFieldLength(tupleFidx), 
						resultFrameAccessor.getBuffer().array(), 
						resultFrameAccessor.getTupleStartOffset(resultTidx) + resultFrameAccessor.getFieldSlotsLength() + resultFrameAccessor.getFieldStartOffset(resultTidx, i), 
						resultFrameAccessor.getFieldLength(resultTidx, i));
				if(cmp != 0) break;
			}
						
			// match found
			if(cmp == 0) {
				newBufIdx = appendTupleToNewResults(btreeCursor, newBufIdx);
				
				advanceCursor = true;
				advancePrevResult = true;
			}
			else {
				if(cmp < 0) {
					advanceCursor = true;
					advancePrevResult = false;
				}
				else {
					advanceCursor = false;
					advancePrevResult = true;
				}
			}
			
			if(advancePrevResult) {
				resultTidx++;
				// step to the next previous-result frame when this one runs out
				if(resultTidx >= resultFrameAccessor.getTupleCount()) {
					prevBufIdx++;
					if(prevBufIdx <= maxPrevBufIdx) {
						prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
						resultFrameAccessor.reset(prevCurrentBuffer);
						resultTidx = 0;
					}
				}
			}
		}
		
		return newBufIdx;
	}

	@Override
	public IInvertedIndexResultCursor getResultCursor() {
		resultCursor.setResults(newResultBuffers, maxResultBufIdx + 1);
		return resultCursor;
	}
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..9a47280
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -0,0 +1,83 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+
+public class DelimitedUTF8StringBinaryTokenizer implements IBinaryTokenizer {
+	
+	private static final RecordDescriptor tokenSchema = 
+		new RecordDescriptor(new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE } );
+	
+	private final char delimiter;
+	private byte[] data;
+	private int start;
+	private int length;
+		
+	private int tokenLength;
+	private int tokenStart;
+	private int pos;	
+		
+	public DelimitedUTF8StringBinaryTokenizer(char delimiter) {
+		this.delimiter = delimiter;
+	}
+	
+	@Override
+	public int getTokenLength() {
+		return tokenLength;
+	}
+
+	@Override
+	public int getTokenStartOff() {
+		return tokenStart;
+	}
+
+	@Override
+	public boolean hasNext() {		
+		if(pos >= start + length) return false;
+		else return true;						
+	}
+	
+	@Override
+	public void next() {
+		tokenLength = 0;
+		tokenStart = pos;
+		while(pos < start + length) {
+			int len = StringUtils.charSize(data, pos);			
+			char ch = StringUtils.charAt(data, pos);	
+			pos += len;
+			if(ch == delimiter) {
+				break;
+			}
+			tokenLength += len;
+		}
+	}
+	
+	@Override
+	public void reset(byte[] data, int start, int length) {		
+		this.data = data;
+		this.start = start;
+		this.pos = start;
+		this.length = length;			
+		this.tokenLength = 0;
+		this.tokenStart = 0; 
+        pos += 2; // UTF-8 specific
+	}
+	
+	@Override
+	public void writeToken(DataOutput dos) throws IOException {
+		// WARNING: 2-byte length indicator is specific to UTF-8
+		dos.writeShort((short)tokenLength);
+		dos.write(data, tokenStart, tokenLength);
+	}
+
+	@Override
+	public RecordDescriptor getTokenSchema() {
+		return tokenSchema;
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..6432c4a
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,19 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;

import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;

/**
 * Serializable factory for {@link DelimitedUTF8StringBinaryTokenizer};
 * captures only the delimiter character.
 */
public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
	
	private static final long serialVersionUID = 1L;
	// character at which the input string is split into tokens
	private final char delimiter;
	
	public DelimitedUTF8StringBinaryTokenizerFactory(char delimiter) {
		this.delimiter = delimiter;
	}
	
	@Override
	public IBinaryTokenizer createBinaryTokenizer() {
		return new DelimitedUTF8StringBinaryTokenizer(delimiter);
	}	
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..68fd502
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
@@ -0,0 +1,130 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;

import java.io.DataOutput;
import java.io.IOException;

import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;

/**
 * Tokenizer that emits the q-grams of a serialized UTF-8 string as 31-based
 * hash values (ints), one per call to next(). With prePost enabled, the
 * string is conceptually padded with q-1 PRECHAR ('#') characters at the
 * front and POSTCHAR ('$') characters at the end, so grams overlapping the
 * string boundaries are also produced. Tokens are derived values, not slices
 * of the input, so getTokenStartOff()/getTokenLength() are unsupported.
 */
public class HashedQGramUTF8StringBinaryTokenizer implements IBinaryTokenizer {
	
	// each token is a single int (the q-gram's hash)
	private static final RecordDescriptor tokenSchema = 
		new RecordDescriptor(new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE } );
	
	private final boolean prePost;
	private final int q;
	private byte[] data;
	private int start;
	private int length;	
	// 1-based count of grams produced so far (prePost mode only)
	private int gramNum;
	// number of characters in the string, read from its UTF length header
	private int utflen;
	
	private final char PRECHAR = '#';
	private final char POSTCHAR = '$';
	
	// charPos counts characters; pos is the matching byte offset into data
	private int charPos;
	private int pos;
	// hash of the most recently produced q-gram
	private int hashedGram;
	
	HashedQGramUTF8StringBinaryTokenizer(int q, boolean prePost) {
		this.prePost = prePost;
		this.q = q;
	}
	
	@Override
	public int getTokenLength() {
		// the produced token (hashed q-gram) is derived from data 
		// but not contained in it
		// therefore this call does not make sense
		return -1; 
	}

	@Override
	public int getTokenStartOff() {
		// the produced token (hashed q-gram) is derived from data 
		// but not contained in it
		// therefore this call does not make sense
		return -1;
	}

	@Override
	public boolean hasNext() {
		// NOTE(review): pos is a byte offset while q counts characters; for
		// multi-byte UTF-8 characters the non-prePost bound (start + length - q)
		// mixes units — confirm against StringUtils' encoding assumptions.
		if((prePost && pos >= start + length) || (!prePost && pos >= start + length - q)) return false;
		else return true;		
	}

	@Override
	public void next() {
		hashedGram = 0;
		if(prePost) {			
			// first q-1 calls: grams that overlap the front padding; the
			// cursor (pos/charPos) is intentionally not advanced here
			if(gramNum < q) {				
				for(int i = 0; i < q - gramNum; i++) {
					hashedGram = 31 * hashedGram + PRECHAR;
				}				
				
				int tmpPos = pos;
				for(int i = 0; i < gramNum; i++) {
					hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
					tmpPos += StringUtils.charSize(data, tmpPos);
				}
			}
			else {								
				// interior gram, possibly overlapping the back padding
				int stopStr = Math.min(charPos + q, utflen);
				int tmpPos = pos;
				for(int i = charPos; i < stopStr; i++) {
					hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
					tmpPos += StringUtils.charSize(data, tmpPos);
				}
				
				// pad with POSTCHARs when the gram extends past the string end
				int stopPost = (charPos + q) - (utflen);				
				for(int i = 0; i < stopPost; i++) {
					hashedGram = 31 * hashedGram + POSTCHAR;
				}
				pos += StringUtils.charSize(data, pos);
				charPos++;
			}
			gramNum++;										
		}
		else {			
			// plain q-gram starting at the current character
			int tmpPos = pos;
			for(int i = charPos; i < charPos + q; i++) {	
				hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
				tmpPos += StringUtils.charSize(data, tmpPos);
			}
			pos += StringUtils.charSize(data, pos);
			charPos++;
		}		
	}

	@Override
	public void reset(byte[] data, int start, int length) {
		this.data = data;
		this.start = start;	
		this.length = length;
		this.utflen = StringUtils.getUTFLen(data, start);
		this.pos = start + 2; // skip 2-byte UTF length indicator (UTF-8 specific)
		this.gramNum = 1;
		this.charPos = 0;
	}
	
	@Override
	public void writeToken(DataOutput dos) throws IOException {
		dos.writeInt(hashedGram);		
	}
	
	public char getPreChar() {
		return PRECHAR;
	}
	
	public char getPostChar() {
		return POSTCHAR;
	}

	@Override
	public RecordDescriptor getTokenSchema() {
		return tokenSchema;
	}
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..98fd9ad
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,21 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
+
+public class HashedQGramUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
+	
+	private static final long serialVersionUID = 1L;
+	private final int q;
+	private final boolean prePost;
+	
+	public HashedQGramUTF8StringBinaryTokenizerFactory(int q, boolean prePost) {
+		this.q = q;
+		this.prePost = prePost;
+	}
+	
+	@Override
+	public IBinaryTokenizer createBinaryTokenizer() {
+		return new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java b/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
new file mode 100644
index 0000000..3f00e4c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
@@ -0,0 +1,262 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchers;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.File;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
+import edu.uci.ics.hyracks.control.nc.runtime.RootHyracksContext;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.IntegerBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.UTF8StringBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeMetaDataFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeMetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.MetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.btree.impls.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.btree.tuples.SimpleTupleWriterFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SimpleConjunctiveSearcher;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.common.buffercache.BufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ClockPageReplacementStrategy;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICacheMemoryAllocator;
+import edu.uci.ics.hyracks.storage.common.buffercache.IPageReplacementStrategy;
+import edu.uci.ics.hyracks.storage.common.file.FileInfo;
+import edu.uci.ics.hyracks.storage.common.file.FileManager;
+
/**
 * End-to-end test for {@link SimpleConjunctiveSearcher}: builds an inverted
 * index as a BTree of (token-string, document-id) pairs, then runs a
 * conjunctive (AND) query over three tokens and prints the intersected ids.
 *
 * NOTE(review): this test prints results but contains no assertions, so it
 * only verifies that the search path runs without throwing — consider adding
 * expected-result checks.
 */
public class SimpleConjunctiveSearcherTest {
	
	// testing params
	private static final int PAGE_SIZE = 256;
    private static final int NUM_PAGES = 10;
    private static final int HYRACKS_FRAME_SIZE = 256;

	// realistic params
	//private static final int PAGE_SIZE = 32768;
    //private static final int NUM_PAGES = 1000;
    //private static final int HYRACKS_FRAME_SIZE = 32768;
    
	private String tmpDir = System.getProperty("java.io.tmpdir");
	
    // Allocates heap ByteBuffers for the buffer cache's page pool.
    public class BufferAllocator implements ICacheMemoryAllocator {
        @Override
        public ByteBuffer[] allocate(int pageSize, int numPages) {
            ByteBuffer[] buffers = new ByteBuffer[numPages];
            for (int i = 0; i < numPages; ++i) {
                buffers[i] = ByteBuffer.allocate(pageSize);
            }
            return buffers;
        }
    }
    
	@Test	
	public void test01() throws Exception { 
		    	
		// --- storage setup: buffer cache backed by a temp file ---
    	FileManager fileManager = new FileManager();
    	ICacheMemoryAllocator allocator = new BufferAllocator();
    	IPageReplacementStrategy prs = new ClockPageReplacementStrategy();
    	IBufferCache bufferCache = new BufferCache(allocator, prs, fileManager, PAGE_SIZE, NUM_PAGES);

    	// NOTE(review): raf is never closed and the temp file is never deleted;
    	// consider try/finally cleanup as in other BTree tests — verify.
    	File f = new File(tmpDir + "/" + "btreetest.bin");
    	RandomAccessFile raf = new RandomAccessFile(f, "rw");
    	int fileId = 0;
    	FileInfo fi = new FileInfo(fileId, raf);
    	fileManager.registerFile(fi);
    	
    	// declare fields: field 0 = token (variable-length string), field 1 = doc id (4-byte int)
    	int fieldCount = 2;
    	ITypeTrait[] typeTraits = new ITypeTrait[fieldCount];
        typeTraits[0] = new TypeTrait(ITypeTrait.VARIABLE_LENGTH);
        typeTraits[1] = new TypeTrait(4);
    	
        // declare keys: both fields are key fields so (token, id) pairs are
        // stored in token-major, id-minor order — exactly a sorted inverted list
    	int keyFieldCount = 2;
    	IBinaryComparator[] cmps = new IBinaryComparator[keyFieldCount];
    	cmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
    	cmps[1] = IntegerBinaryComparatorFactory.INSTANCE.createBinaryComparator();
    	
    	MultiComparator cmp = new MultiComparator(typeTraits, cmps);
    	    	
    	//TypeAwareTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(typeTraits);
    	SimpleTupleWriterFactory tupleWriterFactory = new SimpleTupleWriterFactory();
        IBTreeLeafFrameFactory leafFrameFactory = new NSMLeafFrameFactory(tupleWriterFactory);
        IBTreeInteriorFrameFactory interiorFrameFactory = new NSMInteriorFrameFactory(tupleWriterFactory);
        IBTreeMetaDataFrameFactory metaFrameFactory = new MetaDataFrameFactory();
        
        IBTreeLeafFrame leafFrame = leafFrameFactory.getFrame();
        IBTreeInteriorFrame interiorFrame = interiorFrameFactory.getFrame();
        IBTreeMetaDataFrame metaFrame = metaFrameFactory.getFrame();   
        
    	BTree btree = new BTree(bufferCache, interiorFrameFactory, leafFrameFactory, cmp);
    	btree.create(fileId, leafFrame, metaFrame);
    	btree.open(fileId);

    	// fixed seed so the generated index is reproducible across runs
    	Random rnd = new Random();
    	rnd.setSeed(50);

    	IHyracksContext ctx = new RootHyracksContext(HYRACKS_FRAME_SIZE);        
        ByteBuffer frame = ctx.getResourceManager().allocateFrame();
		FrameTupleAppender appender = new FrameTupleAppender(ctx);				
		ArrayTupleBuilder tb = new ArrayTupleBuilder(cmp.getFieldCount());
		DataOutput dos = tb.getDataOutput();
		
		ISerializerDeserializer[] btreeSerde = { UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
		RecordDescriptor btreeRecDesc = new RecordDescriptor(btreeSerde);
		IFrameTupleAccessor accessor = new FrameTupleAccessor(ctx, btreeRecDesc);
		accessor.reset(frame);
		FrameTupleReference tuple = new FrameTupleReference();
    	
		// vocabulary of tokens that documents may contain
		List<String> tokens = new ArrayList<String>();
		tokens.add("computer");
		tokens.add("hyracks");
		tokens.add("fast");
		tokens.add("university");
		tokens.add("science");		
		tokens.add("major");				
		
		// each later token is added to documents with decreasing probability
		// (addProb grows, and a doc gets the token when rnd % addProb == 0)
		int maxId = 1000;
		int addProb = 0;
		int addProbStep = 2;
		
		// --- index build: insert randomly chosen (token, docId) pairs ---
    	for (int i = 0; i < tokens.size(); i++) {
    		
    		addProb += addProbStep;
    		for(int j = 0; j < maxId; j++) {    			
    			if((Math.abs(rnd.nextInt()) % addProb) == 0) {
    				tb.reset();
    				UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), dos);
    				tb.addFieldEndOffset();
    				IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
    				tb.addFieldEndOffset();        	

    				appender.reset(frame, true);
    				appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize());

    				tuple.reset(accessor, 0);

    				// NOTE(review): insert failures are swallowed after printing;
    				// a failed insert would silently corrupt the expected result set.
    				try {
    					btree.insert(tuple, leafFrame, interiorFrame, metaFrame);
    				} catch (Exception e) {
    					e.printStackTrace();
    				}    	
    			}
    		}
    	}     
    	
    	// build query as tuple reference: a single string field holding the
    	// whole query; the searcher tokenizes it with the given tokenizer
    	ISerializerDeserializer[] querySerde = { UTF8StringSerializerDeserializer.INSTANCE };
		RecordDescriptor queryRecDesc = new RecordDescriptor(querySerde);
    	
    	FrameTupleAppender queryAppender = new FrameTupleAppender(ctx);				
		ArrayTupleBuilder queryTb = new ArrayTupleBuilder(querySerde.length);
		DataOutput queryDos = queryTb.getDataOutput();		
		
		IFrameTupleAccessor queryAccessor = new FrameTupleAccessor(ctx, queryRecDesc);
		queryAccessor.reset(frame);		
		FrameTupleReference queryTuple = new FrameTupleReference();
		
    	String query = "computer hyracks fast";
    	char queryDelimiter = ' ';
    	IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(queryDelimiter);    	
    	
    	queryTb.reset();
    	UTF8StringSerializerDeserializer.INSTANCE.serialize(query, queryDos);
    	queryTb.addFieldEndOffset();    		    		
    	
    	queryAppender.reset(frame, true);
    	queryAppender.append(queryTb.getFieldEndOffsets(), queryTb.getByteArray(), 0, queryTb.getSize());		
    	queryTuple.reset(queryAccessor, 0);
    	    	
    	// result tuples carry only the value fields (the doc ids), i.e. the
    	// btree fields after the first numKeyFields key fields
    	int numKeyFields = 1;
    	int numValueFields = 1;
    	ISerializerDeserializer[] resultSerde = new ISerializerDeserializer[numValueFields];
    	for(int i = 0; i < numValueFields; i++) {
    		resultSerde[i] = btreeSerde[numKeyFields + i]; 
    	}
    	RecordDescriptor resultRecDesc = new RecordDescriptor(resultSerde);
    	FrameTupleAccessor resultAccessor = new FrameTupleAccessor(ctx, resultRecDesc);
    	FrameTupleReference resultTuple = new FrameTupleReference();
    	
    	SimpleConjunctiveSearcher searcher = new SimpleConjunctiveSearcher(ctx, btree, btreeRecDesc, queryTokenizer, numKeyFields, numValueFields);
    	
    	long timeStart = System.currentTimeMillis();
    	searcher.search(queryTuple, 0);
    	long timeEnd = System.currentTimeMillis();    	
    	System.out.println("SEARCH TIME: " + (timeEnd - timeStart) + "ms");
    	    	    	
    	// --- print intersection results (doc ids containing all query tokens) ---
    	System.out.println("INTERSECTION RESULTS");
    	IInvertedIndexResultCursor resultCursor = searcher.getResultCursor();
    	while(resultCursor.hasNext()) {
    		resultCursor.next();
    		resultAccessor.reset(resultCursor.getBuffer());
    		for(int i = 0; i < resultAccessor.getTupleCount(); i++) {
				resultTuple.reset(resultAccessor, i);
				for(int j = 0; j < resultTuple.getFieldCount(); j++) {
					ByteArrayInputStream inStream = new ByteArrayInputStream(resultTuple.getFieldData(j), resultTuple.getFieldStart(j), resultTuple.getFieldLength(j));
					DataInput dataIn = new DataInputStream(inStream);
					Object o = resultSerde[j].deserialize(dataIn);		
					System.out.print(o + " ");
				}
				System.out.println();
			}
    	}
    	    	    	
		/*
		IBinaryComparator[] searchCmps = new IBinaryComparator[1];
        searchCmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
        MultiComparator searchCmp = new MultiComparator(typeTraits, searchCmps);
		
    	// ordered scan        
        IBTreeCursor scanCursor = new RangeSearchCursor(leafFrame);
        RangePredicate nullPred = new RangePredicate(true, null, null, null);
        btree.search(scanCursor, nullPred, leafFrame, interiorFrame);
        
        try {
            while (scanCursor.hasNext()) {
                scanCursor.next();
                ITupleReference frameTuple = scanCursor.getTuple();                
                String rec = cmp.printTuple(frameTuple, recDescSers);
                System.out.println(rec);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            scanCursor.close();
        }
        */
                
        // NOTE(review): cleanup is not in a finally block — an exception above
        // leaks the buffer cache and file manager for the rest of the JVM run.
        btree.close();

        bufferCache.close();
        fileManager.close();
	}		
}
diff --git a/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java b/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
new file mode 100644
index 0000000..1c1f1f9
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
@@ -0,0 +1,173 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.util.ArrayList;
+import java.util.Random;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.data.hash.UTF8StringBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+
+public class TokenizerTest {
+
+	// testing DelimitedUTF8StringBinaryTokenizer
+	@Test	
+	public void test01() throws Exception { 
+		Random rnd = new Random(50);
+		
+		int numDocs = 100;
+		int maxWords = 1000;
+		int maxWordLength = 50;
+		char delimiter = ' ';
+				
+		DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(delimiter);
+		
+		// create a bunch of documents
+		for(int i = 0; i < numDocs; i++) {
+			
+			// create a single document with a bunch of words
+			int words = (Math.abs(rnd.nextInt()) % maxWords) + 1;
+			StringBuilder strBuilder = new StringBuilder();
+			for(int j = 0; j < words; j++) {
+				int len = (Math.abs(rnd.nextInt()) % maxWordLength) + 1;
+				String s = randomString(len, rnd);
+				strBuilder.append(s);
+				if(j < words-1) strBuilder.append(delimiter);				
+			}
+			
+			String doc = strBuilder.toString();			
+			
+			// serialize document into baaos
+			ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+			DataOutputStream dos = new DataOutputStream(baaos);			
+			UTF8StringSerializerDeserializer.INSTANCE.serialize(doc, dos);
+			byte[] data = baaos.toByteArray();
+			
+			// use binary tokenizer and compare with Java tokenizer
+			String[] cmpTokens = doc.split(new String(new char[] { delimiter }));
+			int cmpCounter = 0;
+			
+			tok.reset(data, 0, data.length);
+			while(tok.hasNext()) {
+				tok.next();
+				
+				// write token to outputstream
+				ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
+				DataOutputStream dosWrite = new DataOutputStream(baaosWrite);			
+				tok.writeToken(dosWrite);
+
+				// deserialize token to get string object
+				ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
+				DataInput dataIn = new DataInputStream(inStream);
+				String s = UTF8StringSerializerDeserializer.INSTANCE.deserialize(dataIn);
+				
+				Assert.assertEquals(s, cmpTokens[cmpCounter++]);
+			}			
+		}		
+	}
+	
+	// testing HashedQGramUTF8StringBinaryTokenizer
+	@Test
+	public void test02() throws Exception {
+		Random rnd = new Random(50);
+		
+		int numStrings = 1000;
+		int maxStrLen = 100;
+		int minQ = 2;
+		int maxQ = 10;
+		
+		// we test the correctness of HashedQGramUTF8StringBinaryTokenizer as follows:
+		// 1.1. tokenize the string into q-gram strings
+		// 1.2. serialize q-gram strings into bytes
+		// 1.3. compute hashed gram with UTF8StringBinaryHashFunctionFactory
+		// 2.1. serialize string into bytes
+		// 2.2. tokenize serialized string into hashed q-grams
+		// 2.3. test whether hashed grams from 1.3. and 2.3. are equal
+		for(int i = 0; i < numStrings; i++) {
+			int q = (Math.abs(rnd.nextInt()) % (maxQ - minQ)) + minQ;
+			int strLen = (Math.abs(rnd.nextInt()) % (maxStrLen - q)) + q; 
+			String str = randomString(strLen, rnd);
+			
+			// randomly choose pre and postfixing
+			boolean prePost = false;
+			if(Math.abs(rnd.nextInt()) % 2 == 0) prePost = true;
+			
+			HashedQGramUTF8StringBinaryTokenizer qgramTok = new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
+			
+			String extendedString = str;
+			if(prePost) {
+				// pre and postfix string
+				StringBuilder strBuilder = new StringBuilder();
+				for(int j = 0; j < q - 1; j++) strBuilder.append(qgramTok.getPreChar());
+				strBuilder.append(str);
+				for(int j = 0; j < q - 1; j++) strBuilder.append(qgramTok.getPostChar());				
+				extendedString = strBuilder.toString();
+			}
+			
+			// generate q-grams in deserialized form
+			ArrayList<String> javaGrams = new ArrayList<String>();
+			for(int j = 0; j < extendedString.length() - q + 1; j++) {
+				javaGrams.add(extendedString.substring(j, j + q));
+			}
+			
+			// serialize string for use in binary gram tokenizer
+			ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+			DataOutputStream dos = new DataOutputStream(baaos);			
+			UTF8StringSerializerDeserializer.INSTANCE.serialize(str, dos);
+			byte[] data = baaos.toByteArray();
+						
+			qgramTok.reset(data, 0, data.length);
+			
+			int counter = 0;
+			while(qgramTok.hasNext()) {
+				qgramTok.next();
+				
+				// write token to outputstream
+				ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
+				DataOutputStream dosWrite = new DataOutputStream(baaosWrite);			
+				qgramTok.writeToken(dosWrite);
+
+				// deserialize token to get hashed gram
+				ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
+				DataInput dataIn = new DataInputStream(inStream);
+				Integer binHashedGram = IntegerSerializerDeserializer.INSTANCE.deserialize(dataIn);
+				
+				// create hashed gram to test against
+				ByteArrayAccessibleOutputStream baaosCmp = new ByteArrayAccessibleOutputStream();
+				DataOutputStream dosCmp = new DataOutputStream(baaosCmp);			
+				UTF8StringSerializerDeserializer.INSTANCE.serialize(javaGrams.get(counter), dosCmp);				
+				
+				IBinaryHashFunction strHasher = UTF8StringBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
+				byte[] cmpData = baaosCmp.toByteArray();
+				int cmpHash = strHasher.hash(cmpData, 0, cmpData.length);
+								
+				Assert.assertEquals(binHashedGram.intValue(), cmpHash);
+				
+				counter++;
+			}			
+		}		
+	}
+	
+	public static String randomString(int length, Random random) {
+        int maxAttempts = 1000;
+        int count = 0;
+        while(count < maxAttempts) {		
+        	String s = Long.toHexString(Double.doubleToLongBits(random.nextDouble()));
+        	StringBuilder strBuilder = new StringBuilder();
+        	for (int i = 0; i < s.length() && i < length; i++) {
+        		strBuilder.append(s.charAt(Math.abs(random.nextInt()) % s.length()));
+        	}        	
+        	if(strBuilder.length() > 0) return strBuilder.toString();        	
+        	count++; 
+        }        
+        return "abc";
+    }
+}