Merged r289:290 from the hyracks_io_management branch
git-svn-id: https://hyracks.googlecode.com/svn/trunk/hyracks@291 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/.classpath b/hyracks-tests/hyracks-storage-am-invertedindex-test/.classpath
new file mode 100644
index 0000000..e44aa2f
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/.classpath
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" path="src/test/java"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
+ <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/>
+ <classpathentry kind="output" path="target/classes"/>
+</classpath>
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/.project b/hyracks-tests/hyracks-storage-am-invertedindex-test/.project
new file mode 100644
index 0000000..f60b2f9
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/.project
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>hyracks-storage-am-invertedindex-test</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.jdt.core.javabuilder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ <buildCommand>
+ <name>org.maven.ide.eclipse.maven2Builder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.jdt.core.javanature</nature>
+ <nature>org.maven.ide.eclipse.maven2Nature</nature>
+ </natures>
+</projectDescription>
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.eclipse.jdt.core.prefs b/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..3cd389e
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,6 @@
+#Thu Jan 06 11:27:16 PST 2011
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.6
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.maven.ide.eclipse.prefs b/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.maven.ide.eclipse.prefs
new file mode 100644
index 0000000..99b89a6
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/.settings/org.maven.ide.eclipse.prefs
@@ -0,0 +1,9 @@
+#Thu Jan 06 11:27:16 PST 2011
+activeProfiles=
+eclipse.preferences.version=1
+fullBuildGoals=process-test-resources
+includeModules=false
+resolveWorkspaceProjects=true
+resourceFilterGoals=process-resources resources\:testResources
+skipCompilerPlugin=true
+version=1
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml b/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml
new file mode 100644
index 0000000..b3c62ae
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml
@@ -0,0 +1,55 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-storage-am-invertedindex-test</artifactId>
+ <version>0.1.4-SNAPSHOT</version>
+
+ <parent>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-tests</artifactId>
+ <version>0.1.4-SNAPSHOT</version>
+ </parent>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.0.2</version>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ <dependencies>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-control-nc</artifactId>
+ <version>0.1.4-SNAPSHOT</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-storage-am-invertedindex</artifactId>
+ <version>0.1.4-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-test-support</artifactId>
+ <version>0.1.4-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <type>jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
new file mode 100644
index 0000000..0e53d3f
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
@@ -0,0 +1,297 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchers;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.UUID;
+
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.application.INCApplicationContext;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksJobletContext;
+import edu.uci.ics.hyracks.api.context.IHyracksRootContext;
+import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
+import edu.uci.ics.hyracks.api.io.FileReference;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.IntegerBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.UTF8StringBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeMetaDataFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeMetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.MetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOp;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
+import edu.uci.ics.hyracks.storage.am.btree.impls.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.btree.tuples.TypeAwareTupleWriterFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SimpleConjunctiveSearcher;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.common.IStorageManagerInterface;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICacheMemoryAllocator;
+import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
+import edu.uci.ics.hyracks.test.support.TestJobletContext;
+import edu.uci.ics.hyracks.test.support.TestNCApplicationContext;
+import edu.uci.ics.hyracks.test.support.TestRootContext;
+import edu.uci.ics.hyracks.test.support.TestStageletContext;
+import edu.uci.ics.hyracks.test.support.TestStorageManagerComponentHolder;
+import edu.uci.ics.hyracks.test.support.TestStorageManagerInterface;
+
+public class SimpleConjunctiveSearcherTest {
+ // testing params
+ // private static final int PAGE_SIZE = 256;
+ // private static final int NUM_PAGES = 10;
+ // private static final int HYRACKS_FRAME_SIZE = 256;
+
+ // realistic params
+ // private static final int PAGE_SIZE = 65536;
+ private static final int PAGE_SIZE = 32768;
+ private static final int NUM_PAGES = 10;
+ private static final int HYRACKS_FRAME_SIZE = 32768;
+
+ static {
+ TestStorageManagerComponentHolder.init(PAGE_SIZE, NUM_PAGES);
+ }
+
+ private String tmpDir = System.getProperty("java.io.tmpdir");
+
+ public class BufferAllocator implements ICacheMemoryAllocator {
+ @Override
+ public ByteBuffer[] allocate(int pageSize, int numPages) {
+ ByteBuffer[] buffers = new ByteBuffer[numPages];
+ for (int i = 0; i < numPages; ++i) {
+ buffers[i] = ByteBuffer.allocate(pageSize);
+ }
+ return buffers;
+ }
+ }
+
+ @Test
+ public void test01() throws Exception {
+ IHyracksRootContext rootCtx = new TestRootContext(HYRACKS_FRAME_SIZE);
+ INCApplicationContext appCtx = new TestNCApplicationContext(rootCtx);
+ IHyracksJobletContext jobletCtx = new TestJobletContext(appCtx, UUID.randomUUID(), 0);
+ IHyracksStageletContext stageletCtx = new TestStageletContext(jobletCtx, UUID.randomUUID());
+
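+        // obtain the test buffer cache and file-map provider, then register and open the file backing the BTree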
+ IStorageManagerInterface smi = new TestStorageManagerInterface();
+
+ IBufferCache bufferCache = smi.getBufferCache(stageletCtx);
+ IFileMapProvider fmp = smi.getFileMapProvider(stageletCtx);
+ FileReference file = new FileReference(new File(tmpDir + "/" + "btreetest.bin"));
+ bufferCache.createFile(file);
+ int fileId = fmp.lookupFileId(file);
+ bufferCache.openFile(fileId);
+
+ // declare fields
+ int fieldCount = 2;
+ ITypeTrait[] typeTraits = new ITypeTrait[fieldCount];
+ typeTraits[0] = new TypeTrait(ITypeTrait.VARIABLE_LENGTH);
+ typeTraits[1] = new TypeTrait(4);
+
+ // declare keys
+ int keyFieldCount = 2;
+ IBinaryComparator[] cmps = new IBinaryComparator[keyFieldCount];
+ cmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+ cmps[1] = IntegerBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+
+ MultiComparator cmp = new MultiComparator(typeTraits, cmps);
+
+ TypeAwareTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(typeTraits);
+ // SimpleTupleWriterFactory tupleWriterFactory = new
+ // SimpleTupleWriterFactory();
+ IBTreeLeafFrameFactory leafFrameFactory = new NSMLeafFrameFactory(tupleWriterFactory);
+ // IBTreeLeafFrameFactory leafFrameFactory = new
+ // FieldPrefixNSMLeafFrameFactory(tupleWriterFactory);
+ IBTreeInteriorFrameFactory interiorFrameFactory = new NSMInteriorFrameFactory(tupleWriterFactory);
+ IBTreeMetaDataFrameFactory metaFrameFactory = new MetaDataFrameFactory();
+
+ IBTreeLeafFrame leafFrame = leafFrameFactory.getFrame();
+ IBTreeInteriorFrame interiorFrame = interiorFrameFactory.getFrame();
+ IBTreeMetaDataFrame metaFrame = metaFrameFactory.getFrame();
+
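+        // create and open the BTree that serves as the inverted index here, keyed on the composite <token, id> pair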
+ BTree btree = new BTree(bufferCache, interiorFrameFactory, leafFrameFactory, cmp);
+ btree.create(fileId, leafFrame, metaFrame);
+ btree.open(fileId);
+
+ Random rnd = new Random();
+ rnd.setSeed(50);
+
+ ByteBuffer frame = stageletCtx.allocateFrame();
+ FrameTupleAppender appender = new FrameTupleAppender(stageletCtx.getFrameSize());
+ ArrayTupleBuilder tb = new ArrayTupleBuilder(cmp.getFieldCount());
+ DataOutput dos = tb.getDataOutput();
+
+ ISerializerDeserializer[] btreeSerde = { UTF8StringSerializerDeserializer.INSTANCE,
+ IntegerSerializerDeserializer.INSTANCE };
+ RecordDescriptor btreeRecDesc = new RecordDescriptor(btreeSerde);
+ IFrameTupleAccessor accessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), btreeRecDesc);
+ accessor.reset(frame);
+ FrameTupleReference tuple = new FrameTupleReference();
+
+ List<String> tokens = new ArrayList<String>();
+ tokens.add("computer");
+ tokens.add("hyracks");
+ tokens.add("fast");
+ tokens.add("university");
+ tokens.add("science");
+ tokens.add("major");
+
+ int maxId = 10000;
+ int addProb = 0;
+ int addProbStep = 2;
+
+ BTreeOpContext opCtx = btree.createOpContext(BTreeOp.BTO_INSERT, leafFrame, interiorFrame, metaFrame);
+
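+        // populate the BTree with <token, id> pairs; each successive token is inserted for a smaller random subset of ids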
+ for (int i = 0; i < tokens.size(); i++) {
+
+ addProb += addProbStep;
+ for (int j = 0; j < maxId; j++) {
+ if ((Math.abs(rnd.nextInt()) % addProb) == 0) {
+ tb.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), dos);
+ tb.addFieldEndOffset();
+ IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
+ tb.addFieldEndOffset();
+
+ appender.reset(frame, true);
+ appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize());
+
+ tuple.reset(accessor, 0);
+
+ try {
+ btree.insert(tuple, opCtx);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ int numPages = btree.getMaxPage(metaFrame);
+ System.out.println("NUMPAGES: " + numPages);
+
+ // build query as tuple reference
+ ISerializerDeserializer[] querySerde = { UTF8StringSerializerDeserializer.INSTANCE };
+ RecordDescriptor queryRecDesc = new RecordDescriptor(querySerde);
+
+ FrameTupleAppender queryAppender = new FrameTupleAppender(stageletCtx.getFrameSize());
+ ArrayTupleBuilder queryTb = new ArrayTupleBuilder(querySerde.length);
+ DataOutput queryDos = queryTb.getDataOutput();
+
+ IFrameTupleAccessor queryAccessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), queryRecDesc);
+ queryAccessor.reset(frame);
+ FrameTupleReference queryTuple = new FrameTupleReference();
+
+ String query = "computer hyracks fast";
+ char queryDelimiter = ' ';
+ IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(queryDelimiter);
+
+ queryTb.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(query, queryDos);
+ queryTb.addFieldEndOffset();
+
+ queryAppender.reset(frame, true);
+ queryAppender.append(queryTb.getFieldEndOffsets(), queryTb.getByteArray(), 0, queryTb.getSize());
+ queryTuple.reset(queryAccessor, 0);
+
+ int numKeyFields = 1;
+ int numValueFields = 1;
+ ISerializerDeserializer[] resultSerde = new ISerializerDeserializer[numValueFields];
+ for (int i = 0; i < numValueFields; i++) {
+ resultSerde[i] = btreeSerde[numKeyFields + i];
+ }
+ RecordDescriptor resultRecDesc = new RecordDescriptor(resultSerde);
+ FrameTupleAccessor resultAccessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), resultRecDesc);
+ FrameTupleReference resultTuple = new FrameTupleReference();
+
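+        // run the conjunctive search over the tokens of the query string and time the call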
+ SimpleConjunctiveSearcher searcher = new SimpleConjunctiveSearcher(stageletCtx, btree, btreeRecDesc,
+ queryTokenizer, numKeyFields, numValueFields);
+
+ long timeStart = System.currentTimeMillis();
+ searcher.search(queryTuple, 0);
+ long timeEnd = System.currentTimeMillis();
+ System.out.println("SEARCH TIME: " + (timeEnd - timeStart) + "ms");
+
+ // System.out.println("INTERSECTION RESULTS");
+ IInvertedIndexResultCursor resultCursor = searcher.getResultCursor();
+ while (resultCursor.hasNext()) {
+ resultCursor.next();
+ resultAccessor.reset(resultCursor.getBuffer());
+ for (int i = 0; i < resultAccessor.getTupleCount(); i++) {
+ resultTuple.reset(resultAccessor, i);
+ for (int j = 0; j < resultTuple.getFieldCount(); j++) {
+ ByteArrayInputStream inStream = new ByteArrayInputStream(resultTuple.getFieldData(j),
+ resultTuple.getFieldStart(j), resultTuple.getFieldLength(j));
+ DataInput dataIn = new DataInputStream(inStream);
+ Object o = resultSerde[j].deserialize(dataIn);
+ System.out.print(o + " ");
+ }
+ System.out.println();
+ }
+ }
+
+ /*
+ * IBinaryComparator[] searchCmps = new IBinaryComparator[1];
+ * searchCmps[0] =
+ * UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+ * MultiComparator searchCmp = new MultiComparator(typeTraits,
+ * searchCmps);
+ *
+ * // ordered scan IBTreeCursor scanCursor = new
+ * RangeSearchCursor(leafFrame); RangePredicate nullPred = new
+ * RangePredicate(true, null, null, true, true, null); BTreeOpContext
+ * searchOpCtx = btree.createOpContext(BTreeOp.BTO_SEARCH, leafFrame,
+ * interiorFrame, metaFrame); btree.search(scanCursor, nullPred,
+ * searchOpCtx);
+ *
+ * try { while (scanCursor.hasNext()) { scanCursor.next();
+ * ITupleReference frameTuple = scanCursor.getTuple(); String rec =
+ * cmp.printTuple(frameTuple, btreeSerde); System.out.println(rec); } }
+ * catch (Exception e) { e.printStackTrace(); } finally {
+ * scanCursor.close(); }
+ */
+
+ btree.close();
+ bufferCache.closeFile(fileId);
+ bufferCache.close();
+ }
+}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
new file mode 100644
index 0000000..47c75cf
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.util.ArrayList;
+import java.util.Random;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.data.hash.UTF8StringBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+
+public class TokenizerTest {
+
+ // testing DelimitedUTF8StringBinaryTokenizer
+ @Test
+ public void test01() throws Exception {
+ Random rnd = new Random(50);
+
+ int numDocs = 100;
+ int maxWords = 1000;
+ int maxWordLength = 50;
+ char delimiter = ' ';
+
+ DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(delimiter);
+
+ // create a bunch of documents
+ for (int i = 0; i < numDocs; i++) {
+
+ // create a single document with a bunch of words
+ int words = (Math.abs(rnd.nextInt()) % maxWords) + 1;
+ StringBuilder strBuilder = new StringBuilder();
+ for (int j = 0; j < words; j++) {
+ int len = (Math.abs(rnd.nextInt()) % maxWordLength) + 1;
+ String s = randomString(len, rnd);
+ strBuilder.append(s);
+ if (j < words - 1)
+ strBuilder.append(delimiter);
+ }
+
+ String doc = strBuilder.toString();
+
+ // serialize document into baaos
+ ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dos = new DataOutputStream(baaos);
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(doc, dos);
+ byte[] data = baaos.toByteArray();
+
+ // use binary tokenizer and compare with Java tokenizer
+ String[] cmpTokens = doc.split(new String(new char[] { delimiter }));
+ int cmpCounter = 0;
+
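+            // tokenize the serialized document and check each binary token against the Java String.split() reference tokens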
+ tok.reset(data, 0, data.length);
+ while (tok.hasNext()) {
+ tok.next();
+
+ // write token to outputstream
+ ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
+ tok.writeToken(dosWrite);
+
+ // deserialize token to get string object
+ ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
+ DataInput dataIn = new DataInputStream(inStream);
+ String s = UTF8StringSerializerDeserializer.INSTANCE.deserialize(dataIn);
+
+ Assert.assertEquals(s, cmpTokens[cmpCounter++]);
+ }
+ }
+ }
+
+ // testing HashedQGramUTF8StringBinaryTokenizer
+ @Test
+ public void test02() throws Exception {
+ Random rnd = new Random(50);
+
+ int numStrings = 1000;
+ int maxStrLen = 100;
+ int minQ = 2;
+ int maxQ = 10;
+
+ // we test the correctness of HashedQGramUTF8StringBinaryTokenizer as
+ // follows:
+ // 1.1. tokenize the string into q-gram strings
+ // 1.2. serialize q-gram strings into bytes
+ // 1.3. compute hashed gram with UTF8StringBinaryHashFunctionFactory
+ // 2.1. serialize string into bytes
+ // 2.2. tokenize serialized string into hashed q-grams
+ // 2.3. test whether hashed grams from 1.3. and 2.3. are equal
+ for (int i = 0; i < numStrings; i++) {
+ int q = (Math.abs(rnd.nextInt()) % (maxQ - minQ)) + minQ;
+ int strLen = (Math.abs(rnd.nextInt()) % (maxStrLen - q)) + q;
+ String str = randomString(strLen, rnd);
+
+            // randomly decide whether to use pre- and postfixing
+ boolean prePost = false;
+ if (Math.abs(rnd.nextInt()) % 2 == 0)
+ prePost = true;
+
+ HashedQGramUTF8StringBinaryTokenizer qgramTok = new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
+
+ String extendedString = str;
+ if (prePost) {
+ // pre and postfix string
+ StringBuilder strBuilder = new StringBuilder();
+ for (int j = 0; j < q - 1; j++)
+ strBuilder.append(qgramTok.getPreChar());
+ strBuilder.append(str);
+ for (int j = 0; j < q - 1; j++)
+ strBuilder.append(qgramTok.getPostChar());
+ extendedString = strBuilder.toString();
+ }
+
+ // generate q-grams in deserialized form
+ ArrayList<String> javaGrams = new ArrayList<String>();
+ for (int j = 0; j < extendedString.length() - q + 1; j++) {
+ javaGrams.add(extendedString.substring(j, j + q));
+ }
+
+ // serialize string for use in binary gram tokenizer
+ ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dos = new DataOutputStream(baaos);
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(str, dos);
+ byte[] data = baaos.toByteArray();
+
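+            // tokenize the serialized string into hashed q-grams and compare each against the hash of the corresponding Java-generated gram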
+ qgramTok.reset(data, 0, data.length);
+
+ int counter = 0;
+ while (qgramTok.hasNext()) {
+ qgramTok.next();
+
+ // write token to outputstream
+ ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
+ qgramTok.writeToken(dosWrite);
+
+ // deserialize token to get hashed gram
+ ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
+ DataInput dataIn = new DataInputStream(inStream);
+ Integer binHashedGram = IntegerSerializerDeserializer.INSTANCE.deserialize(dataIn);
+
+ // create hashed gram to test against
+ ByteArrayAccessibleOutputStream baaosCmp = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dosCmp = new DataOutputStream(baaosCmp);
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(javaGrams.get(counter), dosCmp);
+
+ IBinaryHashFunction strHasher = UTF8StringBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
+ byte[] cmpData = baaosCmp.toByteArray();
+ int cmpHash = strHasher.hash(cmpData, 0, cmpData.length);
+
+ Assert.assertEquals(binHashedGram.intValue(), cmpHash);
+
+ counter++;
+ }
+ }
+ }
+
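+    // returns a string of random hex characters, at most 'length' long; falls back to "abc" if nothing could be generated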
+ public static String randomString(int length, Random random) {
+ int maxAttempts = 1000;
+ int count = 0;
+ while (count < maxAttempts) {
+ String s = Long.toHexString(Double.doubleToLongBits(random.nextDouble()));
+ StringBuilder strBuilder = new StringBuilder();
+ for (int i = 0; i < s.length() && i < length; i++) {
+ strBuilder.append(s.charAt(Math.abs(random.nextInt()) % s.length()));
+ }
+ if (strBuilder.length() > 0)
+ return strBuilder.toString();
+ count++;
+ }
+ return "abc";
+ }
+}