Merged -r598:599 from trunk

git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_dev_next@719 123451ca-8445-de46-9d55-352943316053

commit: 44050ae455316f42d85c8cb4ab060b604e929a17 [log] [tgz]
author: vinayakb <vinayakb@123451ca-8445-de46-9d55-352943316053> Tue Nov 01 20:21:21 2011 +0000
committer: vinayakb <vinayakb@123451ca-8445-de46-9d55-352943316053> Tue Nov 01 20:21:21 2011 +0000
tree: 9cdc78bf1fc4d7007bfe50679891c75c7d570f86
parent: 79c5585f9ec7aca955d8a6d0821b7db7c2b7f68f [diff]
diff --git a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
index d11245b..bcaeb22 100644
--- a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
+++ b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java

@@ -18,195 +18,95 @@
 import java.io.IOException;
 
 public class StringUtils {
-	public static char charAt(byte[] b, int s) {
-		int c = b[s] & 0xff;
-		switch (c >> 4) {
-		case 0:
-		case 1:
-		case 2:
-		case 3:
-		case 4:
-		case 5:
-		case 6:
-		case 7:
-			return (char) c;
+    public static char charAt(byte[] b, int s) {
+        int c = b[s] & 0xff;
+        switch (c >> 4) {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+                return (char) c;
 
-		case 12:
-		case 13:
-			return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
+            case 12:
+            case 13:
+                return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
 
-		case 14:
-			return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
+            case 14:
+                return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
 
-		default:
-			throw new IllegalArgumentException();
-		}
-	}
+            default:
+                throw new IllegalArgumentException();
+        }
+    }
 
-	public static int charSize(byte[] b, int s) {
-		int c = b[s] & 0xff;
-		switch (c >> 4) {
-		case 0:
-		case 1:
-		case 2:
-		case 3:
-		case 4:
-		case 5:
-		case 6:
-		case 7:
-			return 1;
+    public static int charSize(byte[] b, int s) {
+        int c = b[s] & 0xff;
+        switch (c >> 4) {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+                return 1;
 
-		case 12:
-		case 13:
-			return 2;
+            case 12:
+            case 13:
+                return 2;
 
-		case 14:
-			return 3;
-		}
-		throw new IllegalStateException();
-	}
+            case 14:
+                return 3;
+        }
+        throw new IllegalStateException();
+    }
 
-	public static int getModifiedUTF8Len(char c) {
-		if (c >= 0x0000 && c <= 0x007F) {
-			return 1;
-		} else if (c <= 0x07FF) {
-			return 2;
-		} else {
-			return 3;
-		}
-	}
+    public static int getModifiedUTF8Len(char c) {
+        if (c >= 0x0000 && c <= 0x007F) {
+            return 1;
+        } else if (c <= 0x07FF) {
+            return 2;
+        } else {
+            return 3;
+        }
+    }
 
-	public static int getStrLen(byte[] b, int s) {
-		int pos = s + 2;
-		int end = pos + getUTFLen(b, s);
-		int charCount = 0;
-		while (pos < end) {
-			charCount++;
-			pos += charSize(b, pos);
-		}
-		return charCount;
-	}
+    public static int getStrLen(byte[] b, int s) {
+        int pos = s + 2;
+        int end = pos + getUTFLen(b, s);
+        int charCount = 0;
+        while (pos < end) {
+            charCount++;
+            pos += charSize(b, pos);
+        }
+        return charCount;
+    }
 
-	public static int getUTFLen(byte[] b, int s) {
-		return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
-	}
+    public static int getUTFLen(byte[] b, int s) {
+        return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
+    }
 
-	public static char toLowerCase(char c) {
-		switch (c) {
-		case 'A':
-			return 'a';
-		case 'B':
-			return 'b';
-		case 'C':
-			return 'c';
-		case 'D':
-			return 'd';
-		case 'E':
-			return 'e';
-		case 'F':
-			return 'f';
-		case 'G':
-			return 'g';
-		case 'H':
-			return 'h';
-		case 'I':
-			return 'i';
-		case 'J':
-			return 'j';
-		case 'K':
-			return 'k';
-		case 'L':
-			return 'l';
-		case 'M':
-			return 'm';
-		case 'N':
-			return 'n';
-		case 'O':
-			return 'o';
-		case 'P':
-			return 'p';
-		case 'Q':
-			return 'q';
-		case 'R':
-			return 'r';
-		case 'S':
-			return 's';
-		case 'T':
-			return 't';
-		case 'U':
-			return 'u';
-		case 'V':
-			return 'v';
-		case 'W':
-			return 'w';
-		case 'X':
-			return 'x';
-		case 'Y':
-			return 'y';
-		case 'Z':
-			return 'z';
-		case 'Ä':
-			return 'ä';
-		case 'Ǟ':
-			return 'ǟ';
-		case 'Ë':
-			return 'ë';
-		case 'Ḧ':
-			return 'ḧ';
-		case 'Ï':
-			return 'ï';
-		case 'Ḯ':
-			return 'ḯ';
-		case 'Ö':
-			return 'ö';
-		case 'Ȫ':
-			return 'ȫ';
-		case 'Ṏ':
-			return 'ṏ';
-		case 'Ü':
-			return 'ü';
-		case 'Ǖ':
-			return 'ǖ';
-		case 'Ǘ':
-			return 'ǘ';
-		case 'Ǚ':
-			return 'ǚ';
-		case 'Ǜ':
-			return 'ǜ';
-		case 'Ṳ':
-			return 'ṳ';
-		case 'Ṻ':
-			return 'ṻ';
-		case 'Ẅ':
-			return 'ẅ';
-		case 'Ẍ':
-			return 'ẍ';
-		case 'Ÿ':
-			return 'ÿ';
-		default:
-			// since I probably missed some chars above
-			// use Java to convert to lower case to be safe
-			return Character.toLowerCase(c);
-		}
-	}
+    public static void writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
 
-	public static void writeCharAsModifiedUTF8(char c, DataOutput dos)
-			throws IOException {
+        if (c >= 0x0000 && c <= 0x007F) {
+            dos.writeByte(c);
+        } else if (c <= 0x07FF) {
+            dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+        } else {
+            dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+            dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+        }
+    }
 
-		if (c >= 0x0000 && c <= 0x007F) {
-			dos.writeByte(c);
-		} else if (c <= 0x07FF) {
-			dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
-			dos.writeByte((byte) (0x80 | (c & 0x3F)));
-		} else {
-			dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
-			dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
-			dos.writeByte((byte) (0x80 | (c & 0x3F)));
-		}
-	}
-
-	public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
-		dos.write((len >>> 8) & 0xFF);
-		dos.write((len >>> 0) & 0xFF);
-	}
+    public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
+        dos.write((len >>> 8) & 0xFF);
+        dos.write((len >>> 0) & 0xFF);
+    }
 }
\ No newline at end of file

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
index 92d6ac2..209c939 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java

@@ -16,7 +16,6 @@
  * 
  * Author: Alexander Behm <abehm (at) ics.uci.edu>
  */
-
 package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
 
 import java.io.DataOutput;
@@ -25,82 +24,81 @@
 import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
 
 public abstract class AbstractUTF8Token implements IToken {
-	public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+    public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
 
-	protected int length;
-	protected int tokenLength;
-	protected int start;
-	protected int tokenCount;
-	protected byte[] data;
-	protected final byte tokenTypeTag;
-	protected final byte countTypeTag;
+    protected int length;
+    protected int tokenLength;
+    protected int start;
+    protected int tokenCount;
+    protected byte[] data;
+    protected final byte tokenTypeTag;
+    protected final byte countTypeTag;
 
-	public AbstractUTF8Token() {
-		tokenTypeTag = -1;
-		countTypeTag = -1;
-	}
+    public AbstractUTF8Token() {
+        tokenTypeTag = -1;
+        countTypeTag = -1;
+    }
 
-	public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
-		this.tokenTypeTag = tokenTypeTag;
-		this.countTypeTag = countTypeTag;
-	}
+    public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+        this.tokenTypeTag = tokenTypeTag;
+        this.countTypeTag = countTypeTag;
+    }
 
-	@Override
-	public byte[] getData() {
-		return data;
-	}
+    @Override
+    public byte[] getData() {
+        return data;
+    }
 
-	@Override
-	public int getLength() {
-		return length;
-	}
+    @Override
+    public int getLength() {
+        return length;
+    }
 
-	public int getLowerCaseUTF8Len(int size) {
-		int lowerCaseUTF8Len = 0;
-		int pos = start;
-		for (int i = 0; i < size; i++) {
-			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
-			lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
-			pos += StringUtils.charSize(data, pos);
-		}
-		return lowerCaseUTF8Len;
-	}
+    public int getLowerCaseUTF8Len(int size) {
+        int lowerCaseUTF8Len = 0;
+        int pos = start;
+        for (int i = 0; i < size; i++) {
+            char c = Character.toLowerCase(StringUtils.charAt(data, pos));
+            lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+            pos += StringUtils.charSize(data, pos);
+        }
+        return lowerCaseUTF8Len;
+    }
 
-	@Override
-	public int getStart() {
-		return start;
-	}
+    @Override
+    public int getStart() {
+        return start;
+    }
 
-	@Override
-	public int getTokenLength() {
-		return tokenLength;
-	}
+    @Override
+    public int getTokenLength() {
+        return tokenLength;
+    }
 
-	public void handleCountTypeTag(DataOutput dos) throws IOException {
-		if (countTypeTag > 0) {
-			dos.write(countTypeTag);
-		}
-	}
+    public void handleCountTypeTag(DataOutput dos) throws IOException {
+        if (countTypeTag > 0) {
+            dos.write(countTypeTag);
+        }
+    }
 
-	public void handleTokenTypeTag(DataOutput dos) throws IOException {
-		if (tokenTypeTag > 0) {
-			dos.write(tokenTypeTag);
-		}
-	}
+    public void handleTokenTypeTag(DataOutput dos) throws IOException {
+        if (tokenTypeTag > 0) {
+            dos.write(tokenTypeTag);
+        }
+    }
 
-	@Override
-	public void reset(byte[] data, int start, int length, int tokenLength,
-			int tokenCount) {
-		this.data = data;
-		this.start = start;
-		this.length = length;
-		this.tokenLength = tokenLength;
-		this.tokenCount = tokenCount;
-	}
+    @Override
+    public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+        this.data = data;
+        this.start = start;
+        this.length = length;
+        this.tokenLength = tokenLength;
+        this.tokenCount = tokenCount;
+    }
 
-	@Override
-	public void serializeTokenCount(DataOutput dos) throws IOException {
-		handleCountTypeTag(dos);
-		dos.writeInt(tokenCount);
-	}
-}
+    @Override
+    public void serializeTokenCount(DataOutput dos) throws IOException {
+        handleCountTypeTag(dos);
+        dos.writeInt(tokenCount);
+    }
+}
\ No newline at end of file

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index de9ad2c..83d0c75 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java

@@ -21,67 +21,61 @@
 
 import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
 
-public class DelimitedUTF8StringBinaryTokenizer extends
-		AbstractUTF8StringBinaryTokenizer {
+public class DelimitedUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
 
-	public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
-			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
-		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
-	}
+    public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+            ITokenFactory tokenFactory) {
+        super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+    }
 
-	@Override
-	public boolean hasNext() {
-		// skip delimiters
-		while (index < length && isSeparator(StringUtils.charAt(data, index))) {
-			index += StringUtils.charSize(data, index);
-		}
-		return index < length;
-	}
+    @Override
+    public boolean hasNext() {
+        // skip delimiters
+        while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+            index += StringUtils.charSize(data, index);
+        }
+        return index < length;
+    }
 
-	private boolean isSeparator(char c) {
-		return !(Character.isLetterOrDigit(c)
-				|| Character.getType(c) == Character.OTHER_LETTER || Character
-				.getType(c) == Character.OTHER_NUMBER);
-	}
+    private boolean isSeparator(char c) {
+        return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+    }
 
-	@Override
-	public void next() {
-		tokenLength = 0;
-		int currentTokenStart = index;
-		while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
-			index += StringUtils.charSize(data, index);
-			tokenLength++;
-		}
-		int tokenCount = 1;
-		if (tokenLength > 0 && !ignoreTokenCount) {
-			// search if we got the same token before
-			for (int i = 0; i < tokensStart.length(); ++i) {
-				if (tokenLength == tokensLength.get(i)) {
-					int tokenStart = tokensStart.get(i);
-					tokenCount++; // assume we found it
-					int offset = 0;
-					int currLength = 0;
-					while (currLength < tokenLength) {
-						// case insensitive comparison
-						if (StringUtils.toLowerCase(StringUtils.charAt(data,
-								currentTokenStart + offset)) != StringUtils
-								.toLowerCase(StringUtils.charAt(data,
-										tokenStart + offset))) {
-							tokenCount--;
-							break;
-						}
-						offset += StringUtils.charSize(data, currentTokenStart
-								+ offset);
-						currLength++;
-					}
-				}
-			}
-			// add the new token to the list of seen tokens
-			tokensStart.add(currentTokenStart);
-			tokensLength.add(tokenLength);
-		}
+    @Override
+    public void next() {
+        tokenLength = 0;
+        int currentTokenStart = index;
+        while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+            index += StringUtils.charSize(data, index);
+            tokenLength++;
+        }
+        int tokenCount = 1;
+        if (tokenLength > 0 && !ignoreTokenCount) {
+            // search if we got the same token before
+            for (int i = 0; i < tokensStart.length(); ++i) {
+                if (tokenLength == tokensLength.get(i)) {
+                    int tokenStart = tokensStart.get(i);
+                    tokenCount++; // assume we found it
+                    int offset = 0;
+                    int currLength = 0;
+                    while (currLength < tokenLength) {
+                        // case insensitive comparison
+                        if (Character.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != Character
+                                .toLowerCase(StringUtils.charAt(data, tokenStart + offset))) {
+                            tokenCount--;
+                            break;
+                        }
+                        offset += StringUtils.charSize(data, currentTokenStart + offset);
+                        currLength++;
+                    }
+                }
+            }
+            // add the new token to the list of seen tokens
+            tokensStart.add(currentTokenStart);
+            tokensLength.add(tokenLength);
+        }
 
-		// set token
-		token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
-	}
-}
+        // set token
+        token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+    }
+}
\ No newline at end of file

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
index 25d1a2c..43f89c7 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java

@@ -25,40 +25,40 @@
 import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
 
 public class HashedUTF8NGramToken extends UTF8NGramToken {
-	public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
-		super(tokenTypeTag, countTypeTag);
-	}
+    public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
 
-	@Override
-	public void serializeToken(DataOutput dos) throws IOException {
-		handleTokenTypeTag(dos);
+    @Override
+    public void serializeToken(DataOutput dos) throws IOException {
+        handleTokenTypeTag(dos);
 
-		int hash = GOLDEN_RATIO_32;
+        int hash = GOLDEN_RATIO_32;
 
-		// pre chars
-		for (int i = 0; i < numPreChars; i++) {
-			hash ^= PRECHAR;
-			hash *= GOLDEN_RATIO_32;
-		}
+        // pre chars
+        for (int i = 0; i < numPreChars; i++) {
+            hash ^= PRECHAR;
+            hash *= GOLDEN_RATIO_32;
+        }
 
-		// regular chars
-		int numRegGrams = tokenLength - numPreChars - numPostChars;
-		int pos = start;
-		for (int i = 0; i < numRegGrams; i++) {
-			hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
-			hash *= GOLDEN_RATIO_32;
-			pos += StringUtils.charSize(data, pos);
-		}
+        // regular chars
+        int numRegGrams = tokenLength - numPreChars - numPostChars;
+        int pos = start;
+        for (int i = 0; i < numRegGrams; i++) {
+            hash ^= Character.toLowerCase(StringUtils.charAt(data, pos));
+            hash *= GOLDEN_RATIO_32;
+            pos += StringUtils.charSize(data, pos);
+        }
 
-		// post chars
-		for (int i = 0; i < numPostChars; i++) {
-			hash ^= POSTCHAR;
-			hash *= GOLDEN_RATIO_32;
-		}
+        // post chars
+        for (int i = 0; i < numPostChars; i++) {
+            hash ^= POSTCHAR;
+            hash *= GOLDEN_RATIO_32;
+        }
 
-		// token count
-		hash += tokenCount;
+        // token count
+        hash += tokenCount;
 
-		dos.writeInt(hash);
-	}
+        dos.writeInt(hash);
+    }
 }

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
index 55237ce..747b65d 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java

@@ -26,63 +26,61 @@
 
 public class HashedUTF8WordToken extends UTF8WordToken {
 
-	private int hash = 0;
+    private int hash = 0;
 
-	public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
-		super(tokenTypeTag, countTypeTag);
-	}
+    public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
 
-	@Override
-	public boolean equals(Object o) {
-		if (o == null) {
-			return false;
-		}
-		if (!(o instanceof IToken)) {
-			return false;
-		}
-		IToken t = (IToken) o;
-		if (t.getTokenLength() != tokenLength) {
-			return false;
-		}
-		int offset = 0;
-		for (int i = 0; i < tokenLength; i++) {
-			if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils
-					.charAt(data, start + offset)) {
-				return false;
-			}
-			offset += StringUtils.charSize(data, start + offset);
-		}
-		return true;
-	}
+    @Override
+    public boolean equals(Object o) {
+        if (o == null) {
+            return false;
+        }
+        if (!(o instanceof IToken)) {
+            return false;
+        }
+        IToken t = (IToken) o;
+        if (t.getTokenLength() != tokenLength) {
+            return false;
+        }
+        int offset = 0;
+        for (int i = 0; i < tokenLength; i++) {
+            if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils.charAt(data, start + offset)) {
+                return false;
+            }
+            offset += StringUtils.charSize(data, start + offset);
+        }
+        return true;
+    }
 
-	@Override
-	public int hashCode() {
-		return hash;
-	}
+    @Override
+    public int hashCode() {
+        return hash;
+    }
 
-	@Override
-	public void reset(byte[] data, int start, int length, int tokenLength,
-			int tokenCount) {
-		super.reset(data, start, length, tokenLength, tokenCount);
+    @Override
+    public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+        super.reset(data, start, length, tokenLength, tokenCount);
 
-		// pre-compute hash value using JAQL-like string hashing
-		int pos = start;
-		hash = GOLDEN_RATIO_32;
-		for (int i = 0; i < tokenLength; i++) {
-			hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
-			hash *= GOLDEN_RATIO_32;
-			pos += StringUtils.charSize(data, pos);
-		}
-		hash += tokenCount;
-	}
+        // pre-compute hash value using JAQL-like string hashing
+        int pos = start;
+        hash = GOLDEN_RATIO_32;
+        for (int i = 0; i < tokenLength; i++) {
+            hash ^= Character.toLowerCase(StringUtils.charAt(data, pos));
+            hash *= GOLDEN_RATIO_32;
+            pos += StringUtils.charSize(data, pos);
+        }
+        hash += tokenCount;
+    }
 
-	@Override
-	public void serializeToken(DataOutput dos) throws IOException {
-		if (tokenTypeTag > 0) {
-			dos.write(tokenTypeTag);
-		}
+    @Override
+    public void serializeToken(DataOutput dos) throws IOException {
+        if (tokenTypeTag > 0) {
+            dos.write(tokenTypeTag);
+        }
 
-		// serialize hash value
-		dos.writeInt(hash);
-	}
+        // serialize hash value
+        dos.writeInt(hash);
+    }
 }

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index 2a13f83..bdbf6f8 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java

@@ -21,103 +21,98 @@
 
 import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
 
-public class NGramUTF8StringBinaryTokenizer extends
-		AbstractUTF8StringBinaryTokenizer {
+public class NGramUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
 
-	private int gramLength;
-	private boolean usePrePost;
+    private int gramLength;
+    private boolean usePrePost;
 
-	private int gramNum;
-	private int totalGrams;
+    private int gramNum;
+    private int totalGrams;
 
-	private final INGramToken concreteToken;
+    private final INGramToken concreteToken;
 
-	public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
-			boolean ignoreTokenCount, boolean sourceHasTypeTag,
-			ITokenFactory tokenFactory) {
-		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
-		this.gramLength = gramLength;
-		this.usePrePost = usePrePost;
-		concreteToken = (INGramToken) token;
-	}
+    public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
+            boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+        super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+        this.gramLength = gramLength;
+        this.usePrePost = usePrePost;
+        concreteToken = (INGramToken) token;
+    }
 
-	@Override
-	public boolean hasNext() {
-		if (gramNum < totalGrams) {
-			return true;
-		} else {
-			return false;
-		}
-	}
+    @Override
+    public boolean hasNext() {
+        if (gramNum < totalGrams) {
+            return true;
+        } else {
+            return false;
+        }
+    }
 
-	@Override
-	public void next() {
-		int currentTokenStart = index;
-		int tokenCount = 1;
-		int numPreChars = 0;
-		int numPostChars = 0;
-		if (usePrePost) {
-			numPreChars = Math.max(gramLength - gramNum - 1, 0);
-			numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
-					- totalGrams + gramNum : 0;
-		}
-		gramNum++;
+    @Override
+    public void next() {
+        int currentTokenStart = index;
+        int tokenCount = 1;
+        int numPreChars = 0;
+        int numPostChars = 0;
+        if (usePrePost) {
+            numPreChars = Math.max(gramLength - gramNum - 1, 0);
+            numPostChars = (gramNum > totalGrams - gramLength) ? gramLength - totalGrams + gramNum : 0;
+        }
+        gramNum++;
 
-		concreteToken.setNumPrePostChars(numPreChars, numPostChars);
-		if (numPreChars == 0) {
-			index += StringUtils.charSize(data, index);
-		}
+        concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+        if (numPreChars == 0) {
+            index += StringUtils.charSize(data, index);
+        }
 
-		// compute token count
-		// ignore pre and post grams for duplicate detection
-		if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
-			int tmpIndex = start;
-			while (tmpIndex < currentTokenStart) {
-				tokenCount++; // assume found
-				int offset = 0;
-				for (int j = 0; j < gramLength; j++) {
-					if (StringUtils.toLowerCase(StringUtils.charAt(data,
-							currentTokenStart + offset)) != StringUtils
-							.toLowerCase(StringUtils.charAt(data, tmpIndex
-									+ offset))) {
-						tokenCount--;
-						break;
-					}
-					offset += StringUtils.charSize(data, tmpIndex + offset);
-				}
-				tmpIndex += StringUtils.charSize(data, tmpIndex);
-			}
-		}
+        // compute token count
+        // ignore pre and post grams for duplicate detection
+        if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+            int tmpIndex = start;
+            while (tmpIndex < currentTokenStart) {
+                tokenCount++; // assume found
+                int offset = 0;
+                for (int j = 0; j < gramLength; j++) {
+                    if (Character.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != Character
+                            .toLowerCase(StringUtils.charAt(data, tmpIndex + offset))) {
+                        tokenCount--;
+                        break;
+                    }
+                    offset += StringUtils.charSize(data, tmpIndex + offset);
+                }
+                tmpIndex += StringUtils.charSize(data, tmpIndex);
+            }
+        }
 
-		// set token
-		token.reset(data, currentTokenStart, length, gramLength, tokenCount);
-	}
+        // set token
+        token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+    }
 
-	@Override
-	public void reset(byte[] data, int start, int length) {
-		super.reset(data, start, length);
-		gramNum = 0;
+    @Override
+    public void reset(byte[] data, int start, int length) {
+        super.reset(data, start, length);
+        gramNum = 0;
 
-		int numChars = 0;
-		int pos = index;
-		int end = pos + utf8Length;
-		while (pos < end) {
-			numChars++;
-			pos += StringUtils.charSize(data, pos);
-		}
+        int numChars = 0;
+        int pos = index;
+        int end = pos + utf8Length;
+        while (pos < end) {
+            numChars++;
+            pos += StringUtils.charSize(data, pos);
+        }
 
-		if (usePrePost) {
-			totalGrams = numChars + gramLength - 1;
-		} else {
-			totalGrams = numChars - gramLength + 1;
-		}
-	}
+        if (usePrePost) {
+            totalGrams = numChars + gramLength - 1;
+        } else {
+            totalGrams = numChars - gramLength + 1;
+        }
+    }
 
-	public void setGramlength(int gramLength) {
-		this.gramLength = gramLength;
-	}
+    public void setGramlength(int gramLength) {
+        this.gramLength = gramLength;
+    }
 
-	public void setPrePost(boolean usePrePost) {
-		this.usePrePost = usePrePost;
-	}
+    public void setPrePost(boolean usePrePost) {
+        this.usePrePost = usePrePost;
+    }
 }

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
index 6b6406f..1b124dc 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java

@@ -26,61 +26,60 @@
 
 public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
 
-	public final static char PRECHAR = '#';
+    public final static char PRECHAR = '#';
 
-	public final static char POSTCHAR = '$';
+    public final static char POSTCHAR = '$';
 
-	protected int numPreChars;
-	protected int numPostChars;
+    protected int numPreChars;
+    protected int numPostChars;
 
-	public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
-		super(tokenTypeTag, countTypeTag);
-	}
+    public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
 
-	@Override
-	public int getNumPostChars() {
-		return numPreChars;
-	}
+    @Override
+    public int getNumPostChars() {
+        return numPreChars;
+    }
 
-	@Override
-	public int getNumPreChars() {
-		return numPostChars;
-	}
+    @Override
+    public int getNumPreChars() {
+        return numPostChars;
+    }
 
-	@Override
-	public void serializeToken(DataOutput dos) throws IOException {
-		handleTokenTypeTag(dos);
+    @Override
+    public void serializeToken(DataOutput dos) throws IOException {
+        handleTokenTypeTag(dos);
 
-		// regular chars
-		int numRegChars = tokenLength - numPreChars - numPostChars;
+        // regular chars
+        int numRegChars = tokenLength - numPreChars - numPostChars;
 
-		// assuming pre and post char need 1-byte each in utf8
-		int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars
-				+ numPostChars;
+        // assuming pre and post char need 1-byte each in utf8
+        int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars + numPostChars;
 
-		// write utf8 length indicator
-		StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+        // write utf8 length indicator
+        StringUtils.writeUTF8Len(tokenUTF8Len, dos);
 
-		// pre chars
-		for (int i = 0; i < numPreChars; i++) {
-			StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
-		}
+        // pre chars
+        for (int i = 0; i < numPreChars; i++) {
+            StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+        }
 
-		int pos = start;
-		for (int i = 0; i < numRegChars; i++) {
-			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
-			StringUtils.writeCharAsModifiedUTF8(c, dos);
-			pos += StringUtils.charSize(data, pos);
-		}
+        int pos = start;
+        for (int i = 0; i < numRegChars; i++) {
+            char c = Character.toLowerCase(StringUtils.charAt(data, pos));
+            StringUtils.writeCharAsModifiedUTF8(c, dos);
+            pos += StringUtils.charSize(data, pos);
+        }
 
-		// post chars
-		for (int i = 0; i < numPostChars; i++) {
-			StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
-		}
-	}
+        // post chars
+        for (int i = 0; i < numPostChars; i++) {
+            StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+        }
+    }
 
-	public void setNumPrePostChars(int numPreChars, int numPostChars) {
-		this.numPreChars = numPreChars;
-		this.numPostChars = numPostChars;
-	}
+    public void setNumPrePostChars(int numPreChars, int numPostChars) {
+        this.numPreChars = numPreChars;
+        this.numPostChars = numPostChars;
+    }
 }

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
index 25e0cd3..2a74145 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java

@@ -26,21 +26,21 @@
 
 public class UTF8WordToken extends AbstractUTF8Token {
 
-	public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
-		super(tokenTypeTag, countTypeTag);
-	}
+    public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
 
-	@Override
-	public void serializeToken(DataOutput dos) throws IOException {
-		handleTokenTypeTag(dos);
+    @Override
+    public void serializeToken(DataOutput dos) throws IOException {
+        handleTokenTypeTag(dos);
 
-		int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
-		StringUtils.writeUTF8Len(tokenUTF8Len, dos);
-		int pos = start;
-		for (int i = 0; i < tokenLength; i++) {
-			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
-			StringUtils.writeCharAsModifiedUTF8(c, dos);
-			pos += StringUtils.charSize(data, pos);
-		}
-	}
+        int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
+        StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+        int pos = start;
+        for (int i = 0; i < tokenLength; i++) {
+            char c = Character.toLowerCase(StringUtils.charAt(data, pos));
+            StringUtils.writeCharAsModifiedUTF8(c, dos);
+            pos += StringUtils.charSize(data, pos);
+        }
+    }
 }
commit	44050ae455316f42d85c8cb4ab060b604e929a17	[log] [tgz]
author	vinayakb <vinayakb@123451ca-8445-de46-9d55-352943316053>	Tue Nov 01 20:21:21 2011 +0000
committer	vinayakb <vinayakb@123451ca-8445-de46-9d55-352943316053>	Tue Nov 01 20:21:21 2011 +0000
tree	9cdc78bf1fc4d7007bfe50679891c75c7d570f86
parent	79c5585f9ec7aca955d8a6d0821b7db7c2b7f68f [diff]