Merged r289:290 from the hyracks_io_management branch

git-svn-id: https://hyracks.googlecode.com/svn/trunk/hyracks@291 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/.classpath b/hyracks-tests/hyracks-storage-am-invertedindex-test/.classpath
new file mode 100644
index 0000000..e44aa2f
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/.classpath
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src/test/java"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
+	<classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/.project b/hyracks-tests/hyracks-storage-am-invertedindex-test/.project
new file mode 100644
index 0000000..f60b2f9
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/.project
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>hyracks-storage-am-invertedindex-test</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.maven.ide.eclipse.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.maven.ide.eclipse.maven2Nature</nature>
+	</natures>
+</projectDescription>
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.eclipse.jdt.core.prefs b/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..3cd389e
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,6 @@
+#Thu Jan 06 11:27:16 PST 2011
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.6
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.maven.ide.eclipse.prefs b/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.maven.ide.eclipse.prefs
new file mode 100644
index 0000000..99b89a6
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.maven.ide.eclipse.prefs
@@ -0,0 +1,9 @@
+#Thu Jan 06 11:27:16 PST 2011
+activeProfiles=
+eclipse.preferences.version=1
+fullBuildGoals=process-test-resources
+includeModules=false
+resolveWorkspaceProjects=true
+resourceFilterGoals=process-resources resources\:testResources
+skipCompilerPlugin=true
+version=1
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml b/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml
new file mode 100644
index 0000000..b3c62ae
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml
@@ -0,0 +1,55 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>edu.uci.ics.hyracks</groupId>
+  <artifactId>hyracks-storage-am-invertedindex-test</artifactId>
+  <version>0.1.4-SNAPSHOT</version>
+
+  <parent>
+    <groupId>edu.uci.ics.hyracks</groupId>
+    <artifactId>hyracks-tests</artifactId>
+    <version>0.1.4-SNAPSHOT</version>
+  </parent>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>2.0.2</version>
+        <configuration>
+          <source>1.6</source>
+          <target>1.6</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+  	<dependency>
+  		<groupId>edu.uci.ics.hyracks</groupId>
+  		<artifactId>hyracks-control-nc</artifactId>
+  		<version>0.1.4-SNAPSHOT</version>
+  		<scope>compile</scope>
+  	</dependency>
+  	<dependency>
+  		<groupId>edu.uci.ics.hyracks</groupId>
+  		<artifactId>hyracks-storage-am-invertedindex</artifactId>
+  		<version>0.1.4-SNAPSHOT</version>
+  		<type>jar</type>
+  		<scope>compile</scope>
+  	</dependency>
+  	<dependency>
+  		<groupId>edu.uci.ics.hyracks</groupId>
+  		<artifactId>hyracks-test-support</artifactId>
+  		<version>0.1.4-SNAPSHOT</version>
+  		<type>jar</type>
+  		<scope>test</scope>
+  	</dependency>
+  	<dependency>
+  		<groupId>junit</groupId>
+  		<artifactId>junit</artifactId>
+  		<version>4.8.1</version>
+  		<type>jar</type>
+  		<scope>test</scope>
+  	</dependency>
+  </dependencies>
+</project>
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
new file mode 100644
index 0000000..0e53d3f
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
@@ -0,0 +1,297 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchers;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.UUID;
+
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.application.INCApplicationContext;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksJobletContext;
+import edu.uci.ics.hyracks.api.context.IHyracksRootContext;
+import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
+import edu.uci.ics.hyracks.api.io.FileReference;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.IntegerBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.UTF8StringBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeMetaDataFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeMetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.MetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOp;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
+import edu.uci.ics.hyracks.storage.am.btree.impls.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.btree.tuples.TypeAwareTupleWriterFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SimpleConjunctiveSearcher;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.common.IStorageManagerInterface;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICacheMemoryAllocator;
+import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
+import edu.uci.ics.hyracks.test.support.TestJobletContext;
+import edu.uci.ics.hyracks.test.support.TestNCApplicationContext;
+import edu.uci.ics.hyracks.test.support.TestRootContext;
+import edu.uci.ics.hyracks.test.support.TestStageletContext;
+import edu.uci.ics.hyracks.test.support.TestStorageManagerComponentHolder;
+import edu.uci.ics.hyracks.test.support.TestStorageManagerInterface;
+
+public class SimpleConjunctiveSearcherTest {
+    // testing params
+    // private static final int PAGE_SIZE = 256;
+    // private static final int NUM_PAGES = 10;
+    // private static final int HYRACKS_FRAME_SIZE = 256;
+
+    // realistic params
+    // private static final int PAGE_SIZE = 65536;
+    private static final int PAGE_SIZE = 32768;
+    private static final int NUM_PAGES = 10;
+    private static final int HYRACKS_FRAME_SIZE = 32768;
+
+    static {
+        TestStorageManagerComponentHolder.init(PAGE_SIZE, NUM_PAGES);
+    }
+
+    private String tmpDir = System.getProperty("java.io.tmpdir");
+
+    /** Allocates cache pages as independent, non-direct {@link ByteBuffer}s. */
+    public class BufferAllocator implements ICacheMemoryAllocator {
+        @Override
+        public ByteBuffer[] allocate(int pageSize, int numPages) {
+            final ByteBuffer[] pages = new ByteBuffer[numPages];
+            for (int page = 0; page < pages.length; page++)
+                pages[page] = ByteBuffer.allocate(pageSize);
+            return pages;
+        }
+    }
+
+    /**
+     * Loads (token, document id) postings into a fresh B-tree and runs a conjunctive search for the
+     * query "computer hyracks fast" against it, printing the page count, the search time and every
+     * result tuple.
+     * <p>
+     * For the i-th token (0-based) each id in [0, maxId) is inserted with probability of roughly
+     * 1 / (2 * (i + 1)), drawn from a fixed-seed {@link Random} so the index contents are reproducible.
+     * <p>
+     * NOTE(review): the results are only printed; nothing is asserted about them yet.
+     *
+     * @throws Exception
+     *             if setting up, loading or searching the index fails. Insert failures propagate
+     *             instead of being printed and ignored, so the search never runs on a partial index.
+     */
+    @Test
+    public void test01() throws Exception {
+        // minimal context stack needed to obtain frames and the storage manager components
+        IHyracksRootContext rootCtx = new TestRootContext(HYRACKS_FRAME_SIZE);
+        INCApplicationContext appCtx = new TestNCApplicationContext(rootCtx);
+        IHyracksJobletContext jobletCtx = new TestJobletContext(appCtx, UUID.randomUUID(), 0);
+        IHyracksStageletContext stageletCtx = new TestStageletContext(jobletCtx, UUID.randomUUID());
+
+        IStorageManagerInterface smi = new TestStorageManagerInterface();
+
+        // the index is stored in a single file under java.io.tmpdir
+        IBufferCache bufferCache = smi.getBufferCache(stageletCtx);
+        IFileMapProvider fmp = smi.getFileMapProvider(stageletCtx);
+        FileReference file = new FileReference(new File(tmpDir + "/" + "btreetest.bin"));
+        bufferCache.createFile(file);
+        int fileId = fmp.lookupFileId(file);
+        bufferCache.openFile(fileId);
+
+        // declare fields: a variable-length token followed by a 4-byte id
+        int fieldCount = 2;
+        ITypeTrait[] typeTraits = new ITypeTrait[fieldCount];
+        typeTraits[0] = new TypeTrait(ITypeTrait.VARIABLE_LENGTH);
+        typeTraits[1] = new TypeTrait(4);
+
+        // declare keys: both fields take part in the B-tree key
+        int keyFieldCount = 2;
+        IBinaryComparator[] cmps = new IBinaryComparator[keyFieldCount];
+        cmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+        cmps[1] = IntegerBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+
+        MultiComparator cmp = new MultiComparator(typeTraits, cmps);
+
+        // frame factories for the leaf, interior and metadata pages of the B-tree
+        TypeAwareTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(typeTraits);
+        IBTreeLeafFrameFactory leafFrameFactory = new NSMLeafFrameFactory(tupleWriterFactory);
+        IBTreeInteriorFrameFactory interiorFrameFactory = new NSMInteriorFrameFactory(tupleWriterFactory);
+        IBTreeMetaDataFrameFactory metaFrameFactory = new MetaDataFrameFactory();
+
+        IBTreeLeafFrame leafFrame = leafFrameFactory.getFrame();
+        IBTreeInteriorFrame interiorFrame = interiorFrameFactory.getFrame();
+        IBTreeMetaDataFrame metaFrame = metaFrameFactory.getFrame();
+
+        // create and open an empty B-tree over the file
+        BTree btree = new BTree(bufferCache, interiorFrameFactory, leafFrameFactory, cmp);
+        btree.create(fileId, leafFrame, metaFrame);
+        btree.open(fileId);
+
+        try {
+            // fixed seed keeps the generated postings identical across runs
+            Random rnd = new Random();
+            rnd.setSeed(50);
+
+            // one frame is reused for every inserted tuple and, later, for the query tuple
+            ByteBuffer frame = stageletCtx.allocateFrame();
+            FrameTupleAppender appender = new FrameTupleAppender(stageletCtx.getFrameSize());
+            ArrayTupleBuilder tb = new ArrayTupleBuilder(cmp.getFieldCount());
+            DataOutput dos = tb.getDataOutput();
+
+            ISerializerDeserializer[] btreeSerde = { UTF8StringSerializerDeserializer.INSTANCE,
+                    IntegerSerializerDeserializer.INSTANCE };
+            RecordDescriptor btreeRecDesc = new RecordDescriptor(btreeSerde);
+            IFrameTupleAccessor accessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), btreeRecDesc);
+            accessor.reset(frame);
+            FrameTupleReference tuple = new FrameTupleReference();
+
+            // indexed vocabulary; the query below uses the first three tokens
+            List<String> tokens = new ArrayList<String>();
+            tokens.add("computer");
+            tokens.add("hyracks");
+            tokens.add("fast");
+            tokens.add("university");
+            tokens.add("science");
+            tokens.add("major");
+
+            int maxId = 10000;
+            int addProb = 0;
+            int addProbStep = 2;
+
+            BTreeOpContext opCtx = btree.createOpContext(BTreeOp.BTO_INSERT, leafFrame, interiorFrame, metaFrame);
+
+            for (int i = 0; i < tokens.size(); i++) {
+                // every later token is rarer: an id is kept when a random draw is divisible by addProb
+                addProb += addProbStep;
+                for (int j = 0; j < maxId; j++) {
+                    if ((Math.abs(rnd.nextInt()) % addProb) == 0) {
+                        tb.reset();
+                        UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), dos);
+                        tb.addFieldEndOffset();
+                        IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
+                        tb.addFieldEndOffset();
+
+                        // the frame is reset before each append, so the tuple is read back at index 0
+                        appender.reset(frame, true);
+                        appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize());
+
+                        tuple.reset(accessor, 0);
+
+                        // each (token, id) pair is generated once, so a failing insert is a real error
+                        btree.insert(tuple, opCtx);
+                    }
+                }
+            }
+
+            int numPages = btree.getMaxPage(metaFrame);
+            System.out.println("NUMPAGES: " + numPages);
+
+            // build query as tuple reference
+            ISerializerDeserializer[] querySerde = { UTF8StringSerializerDeserializer.INSTANCE };
+            RecordDescriptor queryRecDesc = new RecordDescriptor(querySerde);
+
+            FrameTupleAppender queryAppender = new FrameTupleAppender(stageletCtx.getFrameSize());
+            ArrayTupleBuilder queryTb = new ArrayTupleBuilder(querySerde.length);
+            DataOutput queryDos = queryTb.getDataOutput();
+
+            IFrameTupleAccessor queryAccessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), queryRecDesc);
+            queryAccessor.reset(frame);
+            FrameTupleReference queryTuple = new FrameTupleReference();
+
+            String query = "computer hyracks fast";
+            char queryDelimiter = ' ';
+            IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(queryDelimiter);
+
+            queryTb.reset();
+            UTF8StringSerializerDeserializer.INSTANCE.serialize(query, queryDos);
+            queryTb.addFieldEndOffset();
+
+            queryAppender.reset(frame, true);
+            queryAppender.append(queryTb.getFieldEndOffsets(), queryTb.getByteArray(), 0, queryTb.getSize());
+            queryTuple.reset(queryAccessor, 0);
+
+            // result tuples are decoded with the serdes of the value fields, i.e. the B-tree fields
+            // that follow the numKeyFields key fields
+            int numKeyFields = 1;
+            int numValueFields = 1;
+            ISerializerDeserializer[] resultSerde = new ISerializerDeserializer[numValueFields];
+            for (int i = 0; i < numValueFields; i++) {
+                resultSerde[i] = btreeSerde[numKeyFields + i];
+            }
+            RecordDescriptor resultRecDesc = new RecordDescriptor(resultSerde);
+            FrameTupleAccessor resultAccessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), resultRecDesc);
+            FrameTupleReference resultTuple = new FrameTupleReference();
+
+            SimpleConjunctiveSearcher searcher = new SimpleConjunctiveSearcher(stageletCtx, btree, btreeRecDesc,
+                    queryTokenizer, numKeyFields, numValueFields);
+
+            // time only the search call itself, not the result iteration
+            long timeStart = System.currentTimeMillis();
+            searcher.search(queryTuple, 0);
+            long timeEnd = System.currentTimeMillis();
+            System.out.println("SEARCH TIME: " + (timeEnd - timeStart) + "ms");
+
+            // print every result tuple on its own line, fields separated by spaces
+            IInvertedIndexResultCursor resultCursor = searcher.getResultCursor();
+            while (resultCursor.hasNext()) {
+                resultCursor.next();
+                resultAccessor.reset(resultCursor.getBuffer());
+                for (int i = 0; i < resultAccessor.getTupleCount(); i++) {
+                    resultTuple.reset(resultAccessor, i);
+                    for (int j = 0; j < resultTuple.getFieldCount(); j++) {
+                        ByteArrayInputStream inStream = new ByteArrayInputStream(resultTuple.getFieldData(j),
+                                resultTuple.getFieldStart(j), resultTuple.getFieldLength(j));
+                        DataInput dataIn = new DataInputStream(inStream);
+                        Object o = resultSerde[j].deserialize(dataIn);
+                        System.out.print(o + " ");
+                    }
+                    System.out.println();
+                }
+            }
+        } finally {
+            // release the tree, the file and the cache even when loading or searching fails
+            btree.close();
+            bufferCache.closeFile(fileId);
+            bufferCache.close();
+        }
+    }
+}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
new file mode 100644
index 0000000..47c75cf
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.util.ArrayList;
+import java.util.Random;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.data.hash.UTF8StringBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+
+public class TokenizerTest {
+
+    /**
+     * Checks that {@link DelimitedUTF8StringBinaryTokenizer} yields exactly the tokens, in order, that
+     * {@link String#split(String)} yields for the same random space-delimited document.
+     */
+    @Test
+    public void test01() throws Exception {
+        Random rnd = new Random(50);
+
+        int numDocs = 100;
+        int maxWords = 1000;
+        int maxWordLength = 50;
+        char delimiter = ' ';
+
+        DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(delimiter);
+
+        for (int i = 0; i < numDocs; i++) {
+            // create a single document with 1..maxWords words; nextInt(bound) is never negative,
+            // whereas Math.abs(nextInt()) % bound is negative for Integer.MIN_VALUE
+            int words = rnd.nextInt(maxWords) + 1;
+            StringBuilder strBuilder = new StringBuilder();
+            for (int j = 0; j < words; j++) {
+                int len = rnd.nextInt(maxWordLength) + 1;
+                String s = randomString(len, rnd);
+                strBuilder.append(s);
+                if (j < words - 1)
+                    strBuilder.append(delimiter);
+            }
+            String doc = strBuilder.toString();
+
+            // serialize document into baaos
+            ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+            DataOutputStream dos = new DataOutputStream(baaos);
+            UTF8StringSerializerDeserializer.INSTANCE.serialize(doc, dos);
+            byte[] data = baaos.toByteArray();
+
+            // use binary tokenizer and compare with Java tokenizer
+            String[] cmpTokens = doc.split(new String(new char[] { delimiter }));
+            int cmpCounter = 0;
+
+            tok.reset(data, 0, data.length);
+            while (tok.hasNext()) {
+                tok.next();
+                // write token to outputstream
+                ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
+                DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
+                tok.writeToken(dosWrite);
+                // deserialize token to get string object
+                ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
+                DataInput dataIn = new DataInputStream(inStream);
+                String s = UTF8StringSerializerDeserializer.INSTANCE.deserialize(dataIn);
+                Assert.assertEquals(cmpTokens[cmpCounter++], s);
+            }
+            // the tokenizer must not stop before all expected tokens were produced
+            Assert.assertEquals(cmpTokens.length, cmpCounter);
+        }
+    }
+
+    /**
+     * Checks {@link HashedQGramUTF8StringBinaryTokenizer} against a reference computed in Java:
+     * 1.1. tokenize the string into q-gram strings
+     * 1.2. serialize q-gram strings into bytes
+     * 1.3. compute hashed gram with UTF8StringBinaryHashFunctionFactory
+     * 2.1. serialize string into bytes
+     * 2.2. tokenize serialized string into hashed q-grams
+     * 2.3. test whether hashed grams from 1.3. and 2.2. are equal, in order and in number
+     */
+    @Test
+    public void test02() throws Exception {
+        Random rnd = new Random(50);
+
+        int numStrings = 1000;
+        int maxStrLen = 100;
+        int minQ = 2;
+        int maxQ = 10;
+
+        for (int i = 0; i < numStrings; i++) {
+            // nextInt(bound) is never negative, whereas Math.abs(nextInt()) % bound is negative for
+            // Integer.MIN_VALUE; q is in [minQ, maxQ) and strLen in [q, maxStrLen)
+            int q = rnd.nextInt(maxQ - minQ) + minQ;
+            int strLen = rnd.nextInt(maxStrLen - q) + q;
+            String str = randomString(strLen, rnd);
+
+            // randomly choose pre and postfixing
+            boolean prePost = rnd.nextBoolean();
+
+            HashedQGramUTF8StringBinaryTokenizer qgramTok = new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
+
+            String extendedString = str;
+            if (prePost) {
+                // pre and postfix string
+                StringBuilder strBuilder = new StringBuilder();
+                for (int j = 0; j < q - 1; j++)
+                    strBuilder.append(qgramTok.getPreChar());
+                strBuilder.append(str);
+                for (int j = 0; j < q - 1; j++)
+                    strBuilder.append(qgramTok.getPostChar());
+                extendedString = strBuilder.toString();
+            }
+
+            // generate q-grams in deserialized form
+            ArrayList<String> javaGrams = new ArrayList<String>();
+            for (int j = 0; j < extendedString.length() - q + 1; j++) {
+                javaGrams.add(extendedString.substring(j, j + q));
+            }
+
+            // serialize string for use in binary gram tokenizer
+            ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+            DataOutputStream dos = new DataOutputStream(baaos);
+            UTF8StringSerializerDeserializer.INSTANCE.serialize(str, dos);
+            byte[] data = baaos.toByteArray();
+
+            qgramTok.reset(data, 0, data.length);
+
+            int counter = 0;
+            while (qgramTok.hasNext()) {
+                qgramTok.next();
+                // write token to outputstream
+                ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
+                DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
+                qgramTok.writeToken(dosWrite);
+
+                // deserialize token to get hashed gram
+                ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
+                DataInput dataIn = new DataInputStream(inStream);
+                Integer binHashedGram = IntegerSerializerDeserializer.INSTANCE.deserialize(dataIn);
+
+                // create hashed gram to test against
+                ByteArrayAccessibleOutputStream baaosCmp = new ByteArrayAccessibleOutputStream();
+                DataOutputStream dosCmp = new DataOutputStream(baaosCmp);
+                UTF8StringSerializerDeserializer.INSTANCE.serialize(javaGrams.get(counter), dosCmp);
+
+                IBinaryHashFunction strHasher = UTF8StringBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
+                byte[] cmpData = baaosCmp.toByteArray();
+                int cmpHash = strHasher.hash(cmpData, 0, cmpData.length);
+
+                Assert.assertEquals(cmpHash, binHashedGram.intValue());
+                counter++;
+            }
+            // the tokenizer must not stop before all expected grams were produced
+            Assert.assertEquals(javaGrams.size(), counter);
+        }
+    }
+
+    /**
+     * Returns a random string of exactly {@code length} lowercase hexadecimal digits, or "abc" when
+     * {@code length} is not positive. Digits come from {@link Random#nextInt(int)}, which is never
+     * negative; indexing with Math.abs(random.nextInt()) % n can go negative for Integer.MIN_VALUE,
+     * and sampling the hex form of a single double would cap the result at 16 characters.
+     */
+    public static String randomString(int length, Random random) {
+        if (length <= 0)
+            return "abc";
+        StringBuilder strBuilder = new StringBuilder(length);
+        for (int i = 0; i < length; i++) {
+            strBuilder.append(Character.forDigit(random.nextInt(16), 16));
+        }
+        return strBuilder.toString();
+    }
+}