cross merge fullstack_release_candidate into trunk
git-svn-id: https://hyracks.googlecode.com/svn/trunk/fullstack@3208 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml
index 2cf6ce2..59c8c46 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml
+++ b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/pom.xml
@@ -18,9 +18,10 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>2.0.2</version>
<configuration>
- <source>1.6</source>
- <target>1.6</target>
+ <source>1.7</source>
+ <target>1.7</target>
<encoding>UTF-8</encoding>
+ <fork>true</fork>
</configuration>
</plugin>
</plugins>
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/NGramTokenizerTest.java b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/NGramTokenizerTest.java
index 5f15a91..3fb6407 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/NGramTokenizerTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/NGramTokenizerTest.java
@@ -33,6 +33,7 @@
import org.junit.Before;
import org.junit.Test;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.AbstractUTF8Token;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
@@ -41,207 +42,196 @@
public class NGramTokenizerTest {
- private char PRECHAR = '#';
- private char POSTCHAR = '$';
+ private char PRECHAR = '#';
+ private char POSTCHAR = '$';
- private String str = "Jürgen S. Generic's Car";
- private byte[] inputBuffer;
+ private String str = "Jürgen S. Generic's Car";
+ private byte[] inputBuffer;
- private int gramLength = 3;
+ private int gramLength = 3;
- private void getExpectedGrams(String s, int gramLength,
- ArrayList<String> grams, boolean prePost) {
+ private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
- String tmp = s.toLowerCase();
- if (prePost) {
- StringBuilder preBuilder = new StringBuilder();
- for (int i = 0; i < gramLength - 1; i++) {
- preBuilder.append(PRECHAR);
- }
- String pre = preBuilder.toString();
+ String tmp = s.toLowerCase();
+ if (prePost) {
+ StringBuilder preBuilder = new StringBuilder();
+ for (int i = 0; i < gramLength - 1; i++) {
+ preBuilder.append(PRECHAR);
+ }
+ String pre = preBuilder.toString();
- StringBuilder postBuilder = new StringBuilder();
- for (int i = 0; i < gramLength - 1; i++) {
- postBuilder.append(POSTCHAR);
- }
- String post = postBuilder.toString();
+ StringBuilder postBuilder = new StringBuilder();
+ for (int i = 0; i < gramLength - 1; i++) {
+ postBuilder.append(POSTCHAR);
+ }
+ String post = postBuilder.toString();
- tmp = pre + s.toLowerCase() + post;
- }
+ tmp = pre + s.toLowerCase() + post;
+ }
- for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
- String gram = tmp.substring(i, i + gramLength);
- grams.add(gram);
- }
- }
+ for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
+ String gram = tmp.substring(i, i + gramLength);
+ grams.add(gram);
+ }
+ }
- @Before
- public void init() throws Exception {
- // serialize string into bytes
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput dos = new DataOutputStream(baos);
- dos.writeUTF(str);
- inputBuffer = baos.toByteArray();
- }
+ @Before
+ public void init() throws Exception {
+ // serialize string into bytes
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput dos = new DataOutputStream(baos);
+ dos.writeUTF(str);
+ inputBuffer = baos.toByteArray();
+ }
- void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost)
- throws IOException {
- HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
- gramLength, prePost, false, false, tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+ void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
+ HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
+ false, tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
- ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
- HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
- for (String s : expectedGrams) {
- Integer count = gramCounts.get(s);
- if (count == null) {
- count = 1;
- gramCounts.put(s, count);
- } else {
- count++;
- }
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+ HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
+ for (String s : expectedGrams) {
+ Integer count = gramCounts.get(s);
+ if (count == null) {
+ count = 1;
+ gramCounts.put(s, count);
+ } else {
+ count++;
+ }
- int hash = tokenHash(s, count);
- expectedHashedGrams.add(hash);
- }
+ int hash = tokenHash(s, count);
+ expectedHashedGrams.add(hash);
+ }
- int tokenCount = 0;
+ int tokenCount = 0;
- while (tokenizer.hasNext()) {
- tokenizer.next();
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ // serialize hashed token
+ GrowableArray tokenStorage = new GrowableArray();
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenStorage);
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(
- tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
+ DataInput in = new DataInputStream(bais);
- Integer hashedGram = in.readInt();
+ Integer hashedGram = in.readInt();
- // System.out.println(hashedGram);
+ // System.out.println(hashedGram);
- Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+ Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
- tokenCount++;
- }
- // System.out.println("---------");
- }
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
- void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost)
- throws IOException {
- HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
- gramLength, prePost, true, false, tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+ void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
+ HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+ tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
- ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
- for (String s : expectedGrams) {
- int hash = tokenHash(s, 1);
- expectedHashedGrams.add(hash);
- }
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+ for (String s : expectedGrams) {
+ int hash = tokenHash(s, 1);
+ expectedHashedGrams.add(hash);
+ }
- int tokenCount = 0;
+ int tokenCount = 0;
- while (tokenizer.hasNext()) {
- tokenizer.next();
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ // serialize hashed token
+ GrowableArray tokenStorage = new GrowableArray();
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenStorage);
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(
- tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
+ DataInput in = new DataInputStream(bais);
- Integer hashedGram = in.readInt();
+ Integer hashedGram = in.readInt();
- // System.out.println(hashedGram);
+ // System.out.println(hashedGram);
- Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+ Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
- tokenCount++;
- }
- // System.out.println("---------");
- }
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
- void runTestNGramTokenizerWithUTF8Tokens(boolean prePost)
- throws IOException {
- UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
- gramLength, prePost, true, false, tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+ void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
+ UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+ tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
- int tokenCount = 0;
+ int tokenCount = 0;
- while (tokenizer.hasNext()) {
- tokenizer.next();
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ // serialize token
+ GrowableArray tokenStorage = new GrowableArray();
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenStorage);
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(
- tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
+ DataInput in = new DataInputStream(bais);
- String strGram = in.readUTF();
+ String strGram = in.readUTF();
- // System.out.println("\"" + strGram + "\"");
+ // System.out.println("\"" + strGram + "\"");
- Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
+ Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
- tokenCount++;
- }
- // System.out.println("---------");
- }
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
- @Test
- public void testNGramTokenizerWithCountedHashedUTF8Tokens()
- throws Exception {
- runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
- runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
- }
+ @Test
+ public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
+ runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
+ runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
+ }
- @Test
- public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
- runTestNGramTokenizerWithHashedUTF8Tokens(false);
- runTestNGramTokenizerWithHashedUTF8Tokens(true);
- }
+ @Test
+ public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
+ runTestNGramTokenizerWithHashedUTF8Tokens(false);
+ runTestNGramTokenizerWithHashedUTF8Tokens(true);
+ }
- @Test
- public void testNGramTokenizerWithUTF8Tokens() throws IOException {
- runTestNGramTokenizerWithUTF8Tokens(false);
- runTestNGramTokenizerWithUTF8Tokens(true);
- }
+ @Test
+ public void testNGramTokenizerWithUTF8Tokens() throws IOException {
+ runTestNGramTokenizerWithUTF8Tokens(false);
+ runTestNGramTokenizerWithUTF8Tokens(true);
+ }
- public int tokenHash(String token, int tokenCount) {
- int h = AbstractUTF8Token.GOLDEN_RATIO_32;
- for (int i = 0; i < token.length(); i++) {
- h ^= token.charAt(i);
- h *= AbstractUTF8Token.GOLDEN_RATIO_32;
- }
- return h + tokenCount;
- }
+ public int tokenHash(String token, int tokenCount) {
+ int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+ for (int i = 0; i < token.length(); i++) {
+ h ^= token.charAt(i);
+ h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+ }
+ return h + tokenCount;
+ }
}
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java
index c3c9b99..47a068b 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java
@@ -27,6 +27,7 @@
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
import edu.uci.ics.hyracks.data.std.util.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
@@ -112,20 +113,19 @@
}
private class TokenIdPair implements Comparable<TokenIdPair> {
- public ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
- public DataOutputStream dos = new DataOutputStream(baaos);
+ public final GrowableArray tokenStorage = new GrowableArray();
public int id;
TokenIdPair(IToken token, int id) throws IOException {
- token.serializeToken(dos);
+ token.serializeToken(tokenStorage);
this.id = id;
}
@Override
public int compareTo(TokenIdPair o) {
- int cmp = btreeBinCmps[0].compare(baaos.getByteArray(), 0,
- baaos.getByteArray().length, o.baaos.getByteArray(), 0,
- o.baaos.getByteArray().length);
+ int cmp = btreeBinCmps[0].compare(tokenStorage.getByteArray(), 0,
+ tokenStorage.getByteArray().length, o.tokenStorage.getByteArray(), 0,
+ o.tokenStorage.getByteArray().length);
if (cmp == 0) {
return id - o.id;
} else {
@@ -157,8 +157,8 @@
for (TokenIdPair t : pairs) {
tb.reset();
- tb.addField(t.baaos.getByteArray(), 0,
- t.baaos.getByteArray().length);
+ tb.addField(t.tokenStorage.getByteArray(), 0,
+ t.tokenStorage.getByteArray().length);
IntegerSerializerDeserializer.INSTANCE.serialize(t.id, tb.getDataOutput());
tb.addFieldEndOffset();
tuple.reset(tb.getFieldEndOffsets(), tb.getByteArray());
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
index 53fb96d..810c5f5 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
@@ -34,6 +34,7 @@
import org.junit.Before;
import org.junit.Test;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.AbstractUTF8Token;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
@@ -127,14 +128,13 @@
tokenizer.next();
// serialize token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ GrowableArray tokenStorage = new GrowableArray();
IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ token.serializeToken(tokenStorage);
// deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
DataInput in = new DataInputStream(bais);
Integer hashedToken = in.readInt();
@@ -159,14 +159,13 @@
tokenizer.next();
// serialize token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ GrowableArray tokenStorage = new GrowableArray();
IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ token.serializeToken(tokenStorage);
// deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
DataInput in = new DataInputStream(bais);
Integer hashedToken = in.readInt();
@@ -191,14 +190,13 @@
tokenizer.next();
// serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ GrowableArray tokenStorage = new GrowableArray();
IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ token.serializeToken(tokenStorage);
// deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
DataInput in = new DataInputStream(bais);
String strToken = in.readUTF();