Merged -r598:599 from trunk
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_dev_next@719 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
index d11245b..bcaeb22 100644
--- a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
+++ b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
@@ -18,195 +18,95 @@
import java.io.IOException;
public class StringUtils {
- public static char charAt(byte[] b, int s) {
- int c = b[s] & 0xff;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- return (char) c;
+ public static char charAt(byte[] b, int s) {
+ int c = b[s] & 0xff;
+ switch (c >> 4) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ return (char) c;
- case 12:
- case 13:
- return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
+ case 12:
+ case 13:
+ return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
- case 14:
- return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
+ case 14:
+ return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
- default:
- throw new IllegalArgumentException();
- }
- }
+ default:
+ throw new IllegalArgumentException();
+ }
+ }
- public static int charSize(byte[] b, int s) {
- int c = b[s] & 0xff;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- return 1;
+ public static int charSize(byte[] b, int s) {
+ int c = b[s] & 0xff;
+ switch (c >> 4) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ return 1;
- case 12:
- case 13:
- return 2;
+ case 12:
+ case 13:
+ return 2;
- case 14:
- return 3;
- }
- throw new IllegalStateException();
- }
+ case 14:
+ return 3;
+ }
+ throw new IllegalStateException();
+ }
- public static int getModifiedUTF8Len(char c) {
- if (c >= 0x0000 && c <= 0x007F) {
- return 1;
- } else if (c <= 0x07FF) {
- return 2;
- } else {
- return 3;
- }
- }
+ public static int getModifiedUTF8Len(char c) {
+ if (c >= 0x0000 && c <= 0x007F) {
+ return 1;
+ } else if (c <= 0x07FF) {
+ return 2;
+ } else {
+ return 3;
+ }
+ }
- public static int getStrLen(byte[] b, int s) {
- int pos = s + 2;
- int end = pos + getUTFLen(b, s);
- int charCount = 0;
- while (pos < end) {
- charCount++;
- pos += charSize(b, pos);
- }
- return charCount;
- }
+ public static int getStrLen(byte[] b, int s) {
+ int pos = s + 2;
+ int end = pos + getUTFLen(b, s);
+ int charCount = 0;
+ while (pos < end) {
+ charCount++;
+ pos += charSize(b, pos);
+ }
+ return charCount;
+ }
- public static int getUTFLen(byte[] b, int s) {
- return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
- }
+ public static int getUTFLen(byte[] b, int s) {
+ return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
+ }
- public static char toLowerCase(char c) {
- switch (c) {
- case 'A':
- return 'a';
- case 'B':
- return 'b';
- case 'C':
- return 'c';
- case 'D':
- return 'd';
- case 'E':
- return 'e';
- case 'F':
- return 'f';
- case 'G':
- return 'g';
- case 'H':
- return 'h';
- case 'I':
- return 'i';
- case 'J':
- return 'j';
- case 'K':
- return 'k';
- case 'L':
- return 'l';
- case 'M':
- return 'm';
- case 'N':
- return 'n';
- case 'O':
- return 'o';
- case 'P':
- return 'p';
- case 'Q':
- return 'q';
- case 'R':
- return 'r';
- case 'S':
- return 's';
- case 'T':
- return 't';
- case 'U':
- return 'u';
- case 'V':
- return 'v';
- case 'W':
- return 'w';
- case 'X':
- return 'x';
- case 'Y':
- return 'y';
- case 'Z':
- return 'z';
- case 'Ä':
- return 'ä';
- case 'Ǟ':
- return 'ǟ';
- case 'Ë':
- return 'ë';
- case 'Ḧ':
- return 'ḧ';
- case 'Ï':
- return 'ï';
- case 'Ḯ':
- return 'ḯ';
- case 'Ö':
- return 'ö';
- case 'Ȫ':
- return 'ȫ';
- case 'Ṏ':
- return 'ṏ';
- case 'Ü':
- return 'ü';
- case 'Ǖ':
- return 'ǖ';
- case 'Ǘ':
- return 'ǘ';
- case 'Ǚ':
- return 'ǚ';
- case 'Ǜ':
- return 'ǜ';
- case 'Ṳ':
- return 'ṳ';
- case 'Ṻ':
- return 'ṻ';
- case 'Ẅ':
- return 'ẅ';
- case 'Ẍ':
- return 'ẍ';
- case 'Ÿ':
- return 'ÿ';
- default:
- // since I probably missed some chars above
- // use Java to convert to lower case to be safe
- return Character.toLowerCase(c);
- }
- }
+ public static void writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
- public static void writeCharAsModifiedUTF8(char c, DataOutput dos)
- throws IOException {
+ if (c >= 0x0000 && c <= 0x007F) {
+ dos.writeByte(c);
+ } else if (c <= 0x07FF) {
+ dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
+ dos.writeByte((byte) (0x80 | (c & 0x3F)));
+ } else {
+ dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+ dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+ dos.writeByte((byte) (0x80 | (c & 0x3F)));
+ }
+ }
- if (c >= 0x0000 && c <= 0x007F) {
- dos.writeByte(c);
- } else if (c <= 0x07FF) {
- dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
- dos.writeByte((byte) (0x80 | (c & 0x3F)));
- } else {
- dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
- dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
- dos.writeByte((byte) (0x80 | (c & 0x3F)));
- }
- }
-
- public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
- dos.write((len >>> 8) & 0xFF);
- dos.write((len >>> 0) & 0xFF);
- }
+ public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
+ dos.write((len >>> 8) & 0xFF);
+ dos.write((len >>> 0) & 0xFF);
+ }
}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
index 92d6ac2..209c939 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
@@ -16,7 +16,6 @@
*
* Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
-
package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
import java.io.DataOutput;
@@ -25,82 +24,81 @@
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
public abstract class AbstractUTF8Token implements IToken {
- public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+ public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
- protected int length;
- protected int tokenLength;
- protected int start;
- protected int tokenCount;
- protected byte[] data;
- protected final byte tokenTypeTag;
- protected final byte countTypeTag;
+ protected int length;
+ protected int tokenLength;
+ protected int start;
+ protected int tokenCount;
+ protected byte[] data;
+ protected final byte tokenTypeTag;
+ protected final byte countTypeTag;
- public AbstractUTF8Token() {
- tokenTypeTag = -1;
- countTypeTag = -1;
- }
+ public AbstractUTF8Token() {
+ tokenTypeTag = -1;
+ countTypeTag = -1;
+ }
- public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
- this.tokenTypeTag = tokenTypeTag;
- this.countTypeTag = countTypeTag;
- }
+ public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+ this.tokenTypeTag = tokenTypeTag;
+ this.countTypeTag = countTypeTag;
+ }
- @Override
- public byte[] getData() {
- return data;
- }
+ @Override
+ public byte[] getData() {
+ return data;
+ }
- @Override
- public int getLength() {
- return length;
- }
+ @Override
+ public int getLength() {
+ return length;
+ }
- public int getLowerCaseUTF8Len(int size) {
- int lowerCaseUTF8Len = 0;
- int pos = start;
- for (int i = 0; i < size; i++) {
- char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
- lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
- pos += StringUtils.charSize(data, pos);
- }
- return lowerCaseUTF8Len;
- }
+ public int getLowerCaseUTF8Len(int size) {
+ int lowerCaseUTF8Len = 0;
+ int pos = start;
+ for (int i = 0; i < size; i++) {
+ char c = Character.toLowerCase(StringUtils.charAt(data, pos));
+ lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+ pos += StringUtils.charSize(data, pos);
+ }
+ return lowerCaseUTF8Len;
+ }
- @Override
- public int getStart() {
- return start;
- }
+ @Override
+ public int getStart() {
+ return start;
+ }
- @Override
- public int getTokenLength() {
- return tokenLength;
- }
+ @Override
+ public int getTokenLength() {
+ return tokenLength;
+ }
- public void handleCountTypeTag(DataOutput dos) throws IOException {
- if (countTypeTag > 0) {
- dos.write(countTypeTag);
- }
- }
+ public void handleCountTypeTag(DataOutput dos) throws IOException {
+ if (countTypeTag > 0) {
+ dos.write(countTypeTag);
+ }
+ }
- public void handleTokenTypeTag(DataOutput dos) throws IOException {
- if (tokenTypeTag > 0) {
- dos.write(tokenTypeTag);
- }
- }
+ public void handleTokenTypeTag(DataOutput dos) throws IOException {
+ if (tokenTypeTag > 0) {
+ dos.write(tokenTypeTag);
+ }
+ }
- @Override
- public void reset(byte[] data, int start, int length, int tokenLength,
- int tokenCount) {
- this.data = data;
- this.start = start;
- this.length = length;
- this.tokenLength = tokenLength;
- this.tokenCount = tokenCount;
- }
+ @Override
+ public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+ this.data = data;
+ this.start = start;
+ this.length = length;
+ this.tokenLength = tokenLength;
+ this.tokenCount = tokenCount;
+ }
- @Override
- public void serializeTokenCount(DataOutput dos) throws IOException {
- handleCountTypeTag(dos);
- dos.writeInt(tokenCount);
- }
-}
+ @Override
+ public void serializeTokenCount(DataOutput dos) throws IOException {
+ handleCountTypeTag(dos);
+ dos.writeInt(tokenCount);
+ }
+}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index de9ad2c..83d0c75 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -21,67 +21,61 @@
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-public class DelimitedUTF8StringBinaryTokenizer extends
- AbstractUTF8StringBinaryTokenizer {
+public class DelimitedUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
- public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
- boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
- super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
- }
+ public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+ ITokenFactory tokenFactory) {
+ super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ }
- @Override
- public boolean hasNext() {
- // skip delimiters
- while (index < length && isSeparator(StringUtils.charAt(data, index))) {
- index += StringUtils.charSize(data, index);
- }
- return index < length;
- }
+ @Override
+ public boolean hasNext() {
+ // skip delimiters
+ while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+ index += StringUtils.charSize(data, index);
+ }
+ return index < length;
+ }
- private boolean isSeparator(char c) {
- return !(Character.isLetterOrDigit(c)
- || Character.getType(c) == Character.OTHER_LETTER || Character
- .getType(c) == Character.OTHER_NUMBER);
- }
+ private boolean isSeparator(char c) {
+ return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+ }
- @Override
- public void next() {
- tokenLength = 0;
- int currentTokenStart = index;
- while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
- index += StringUtils.charSize(data, index);
- tokenLength++;
- }
- int tokenCount = 1;
- if (tokenLength > 0 && !ignoreTokenCount) {
- // search if we got the same token before
- for (int i = 0; i < tokensStart.length(); ++i) {
- if (tokenLength == tokensLength.get(i)) {
- int tokenStart = tokensStart.get(i);
- tokenCount++; // assume we found it
- int offset = 0;
- int currLength = 0;
- while (currLength < tokenLength) {
- // case insensitive comparison
- if (StringUtils.toLowerCase(StringUtils.charAt(data,
- currentTokenStart + offset)) != StringUtils
- .toLowerCase(StringUtils.charAt(data,
- tokenStart + offset))) {
- tokenCount--;
- break;
- }
- offset += StringUtils.charSize(data, currentTokenStart
- + offset);
- currLength++;
- }
- }
- }
- // add the new token to the list of seen tokens
- tokensStart.add(currentTokenStart);
- tokensLength.add(tokenLength);
- }
+ @Override
+ public void next() {
+ tokenLength = 0;
+ int currentTokenStart = index;
+ while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+ index += StringUtils.charSize(data, index);
+ tokenLength++;
+ }
+ int tokenCount = 1;
+ if (tokenLength > 0 && !ignoreTokenCount) {
+ // search if we got the same token before
+ for (int i = 0; i < tokensStart.length(); ++i) {
+ if (tokenLength == tokensLength.get(i)) {
+ int tokenStart = tokensStart.get(i);
+ tokenCount++; // assume we found it
+ int offset = 0;
+ int currLength = 0;
+ while (currLength < tokenLength) {
+ // case insensitive comparison
+ if (Character.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != Character
+ .toLowerCase(StringUtils.charAt(data, tokenStart + offset))) {
+ tokenCount--;
+ break;
+ }
+ offset += StringUtils.charSize(data, currentTokenStart + offset);
+ currLength++;
+ }
+ }
+ }
+ // add the new token to the list of seen tokens
+ tokensStart.add(currentTokenStart);
+ tokensLength.add(tokenLength);
+ }
- // set token
- token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
- }
-}
+ // set token
+ token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+ }
+}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
index 25d1a2c..43f89c7 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
@@ -25,40 +25,40 @@
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
public class HashedUTF8NGramToken extends UTF8NGramToken {
- public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
+ public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
- @Override
- public void serializeToken(DataOutput dos) throws IOException {
- handleTokenTypeTag(dos);
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
- int hash = GOLDEN_RATIO_32;
+ int hash = GOLDEN_RATIO_32;
- // pre chars
- for (int i = 0; i < numPreChars; i++) {
- hash ^= PRECHAR;
- hash *= GOLDEN_RATIO_32;
- }
+ // pre chars
+ for (int i = 0; i < numPreChars; i++) {
+ hash ^= PRECHAR;
+ hash *= GOLDEN_RATIO_32;
+ }
- // regular chars
- int numRegGrams = tokenLength - numPreChars - numPostChars;
- int pos = start;
- for (int i = 0; i < numRegGrams; i++) {
- hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
- hash *= GOLDEN_RATIO_32;
- pos += StringUtils.charSize(data, pos);
- }
+ // regular chars
+ int numRegGrams = tokenLength - numPreChars - numPostChars;
+ int pos = start;
+ for (int i = 0; i < numRegGrams; i++) {
+ hash ^= Character.toLowerCase(StringUtils.charAt(data, pos));
+ hash *= GOLDEN_RATIO_32;
+ pos += StringUtils.charSize(data, pos);
+ }
- // post chars
- for (int i = 0; i < numPostChars; i++) {
- hash ^= POSTCHAR;
- hash *= GOLDEN_RATIO_32;
- }
+ // post chars
+ for (int i = 0; i < numPostChars; i++) {
+ hash ^= POSTCHAR;
+ hash *= GOLDEN_RATIO_32;
+ }
- // token count
- hash += tokenCount;
+ // token count
+ hash += tokenCount;
- dos.writeInt(hash);
- }
+ dos.writeInt(hash);
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
index 55237ce..747b65d 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
@@ -26,63 +26,61 @@
public class HashedUTF8WordToken extends UTF8WordToken {
- private int hash = 0;
+ private int hash = 0;
- public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
+ public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
- @Override
- public boolean equals(Object o) {
- if (o == null) {
- return false;
- }
- if (!(o instanceof IToken)) {
- return false;
- }
- IToken t = (IToken) o;
- if (t.getTokenLength() != tokenLength) {
- return false;
- }
- int offset = 0;
- for (int i = 0; i < tokenLength; i++) {
- if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils
- .charAt(data, start + offset)) {
- return false;
- }
- offset += StringUtils.charSize(data, start + offset);
- }
- return true;
- }
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+ if (!(o instanceof IToken)) {
+ return false;
+ }
+ IToken t = (IToken) o;
+ if (t.getTokenLength() != tokenLength) {
+ return false;
+ }
+ int offset = 0;
+ for (int i = 0; i < tokenLength; i++) {
+ if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils.charAt(data, start + offset)) {
+ return false;
+ }
+ offset += StringUtils.charSize(data, start + offset);
+ }
+ return true;
+ }
- @Override
- public int hashCode() {
- return hash;
- }
+ @Override
+ public int hashCode() {
+ return hash;
+ }
- @Override
- public void reset(byte[] data, int start, int length, int tokenLength,
- int tokenCount) {
- super.reset(data, start, length, tokenLength, tokenCount);
+ @Override
+ public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+ super.reset(data, start, length, tokenLength, tokenCount);
- // pre-compute hash value using JAQL-like string hashing
- int pos = start;
- hash = GOLDEN_RATIO_32;
- for (int i = 0; i < tokenLength; i++) {
- hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
- hash *= GOLDEN_RATIO_32;
- pos += StringUtils.charSize(data, pos);
- }
- hash += tokenCount;
- }
+ // pre-compute hash value using JAQL-like string hashing
+ int pos = start;
+ hash = GOLDEN_RATIO_32;
+ for (int i = 0; i < tokenLength; i++) {
+ hash ^= Character.toLowerCase(StringUtils.charAt(data, pos));
+ hash *= GOLDEN_RATIO_32;
+ pos += StringUtils.charSize(data, pos);
+ }
+ hash += tokenCount;
+ }
- @Override
- public void serializeToken(DataOutput dos) throws IOException {
- if (tokenTypeTag > 0) {
- dos.write(tokenTypeTag);
- }
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ if (tokenTypeTag > 0) {
+ dos.write(tokenTypeTag);
+ }
- // serialize hash value
- dos.writeInt(hash);
- }
+ // serialize hash value
+ dos.writeInt(hash);
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index 2a13f83..bdbf6f8 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -21,103 +21,98 @@
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-public class NGramUTF8StringBinaryTokenizer extends
- AbstractUTF8StringBinaryTokenizer {
+public class NGramUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
- private int gramLength;
- private boolean usePrePost;
+ private int gramLength;
+ private boolean usePrePost;
- private int gramNum;
- private int totalGrams;
+ private int gramNum;
+ private int totalGrams;
- private final INGramToken concreteToken;
+ private final INGramToken concreteToken;
- public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
- boolean ignoreTokenCount, boolean sourceHasTypeTag,
- ITokenFactory tokenFactory) {
- super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
- this.gramLength = gramLength;
- this.usePrePost = usePrePost;
- concreteToken = (INGramToken) token;
- }
+ public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ this.gramLength = gramLength;
+ this.usePrePost = usePrePost;
+ concreteToken = (INGramToken) token;
+ }
- @Override
- public boolean hasNext() {
- if (gramNum < totalGrams) {
- return true;
- } else {
- return false;
- }
- }
+ @Override
+ public boolean hasNext() {
+ if (gramNum < totalGrams) {
+ return true;
+ } else {
+ return false;
+ }
+ }
- @Override
- public void next() {
- int currentTokenStart = index;
- int tokenCount = 1;
- int numPreChars = 0;
- int numPostChars = 0;
- if (usePrePost) {
- numPreChars = Math.max(gramLength - gramNum - 1, 0);
- numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
- - totalGrams + gramNum : 0;
- }
- gramNum++;
+ @Override
+ public void next() {
+ int currentTokenStart = index;
+ int tokenCount = 1;
+ int numPreChars = 0;
+ int numPostChars = 0;
+ if (usePrePost) {
+ numPreChars = Math.max(gramLength - gramNum - 1, 0);
+ numPostChars = (gramNum > totalGrams - gramLength) ? gramLength - totalGrams + gramNum : 0;
+ }
+ gramNum++;
- concreteToken.setNumPrePostChars(numPreChars, numPostChars);
- if (numPreChars == 0) {
- index += StringUtils.charSize(data, index);
- }
+ concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+ if (numPreChars == 0) {
+ index += StringUtils.charSize(data, index);
+ }
- // compute token count
- // ignore pre and post grams for duplicate detection
- if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
- int tmpIndex = start;
- while (tmpIndex < currentTokenStart) {
- tokenCount++; // assume found
- int offset = 0;
- for (int j = 0; j < gramLength; j++) {
- if (StringUtils.toLowerCase(StringUtils.charAt(data,
- currentTokenStart + offset)) != StringUtils
- .toLowerCase(StringUtils.charAt(data, tmpIndex
- + offset))) {
- tokenCount--;
- break;
- }
- offset += StringUtils.charSize(data, tmpIndex + offset);
- }
- tmpIndex += StringUtils.charSize(data, tmpIndex);
- }
- }
+ // compute token count
+ // ignore pre and post grams for duplicate detection
+ if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+ int tmpIndex = start;
+ while (tmpIndex < currentTokenStart) {
+ tokenCount++; // assume found
+ int offset = 0;
+ for (int j = 0; j < gramLength; j++) {
+ if (Character.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != Character
+ .toLowerCase(StringUtils.charAt(data, tmpIndex + offset))) {
+ tokenCount--;
+ break;
+ }
+ offset += StringUtils.charSize(data, tmpIndex + offset);
+ }
+ tmpIndex += StringUtils.charSize(data, tmpIndex);
+ }
+ }
- // set token
- token.reset(data, currentTokenStart, length, gramLength, tokenCount);
- }
+ // set token
+ token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+ }
- @Override
- public void reset(byte[] data, int start, int length) {
- super.reset(data, start, length);
- gramNum = 0;
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ super.reset(data, start, length);
+ gramNum = 0;
- int numChars = 0;
- int pos = index;
- int end = pos + utf8Length;
- while (pos < end) {
- numChars++;
- pos += StringUtils.charSize(data, pos);
- }
+ int numChars = 0;
+ int pos = index;
+ int end = pos + utf8Length;
+ while (pos < end) {
+ numChars++;
+ pos += StringUtils.charSize(data, pos);
+ }
- if (usePrePost) {
- totalGrams = numChars + gramLength - 1;
- } else {
- totalGrams = numChars - gramLength + 1;
- }
- }
+ if (usePrePost) {
+ totalGrams = numChars + gramLength - 1;
+ } else {
+ totalGrams = numChars - gramLength + 1;
+ }
+ }
- public void setGramlength(int gramLength) {
- this.gramLength = gramLength;
- }
+ public void setGramlength(int gramLength) {
+ this.gramLength = gramLength;
+ }
- public void setPrePost(boolean usePrePost) {
- this.usePrePost = usePrePost;
- }
+ public void setPrePost(boolean usePrePost) {
+ this.usePrePost = usePrePost;
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
index 6b6406f..1b124dc 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
@@ -26,61 +26,60 @@
public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
- public final static char PRECHAR = '#';
+ public final static char PRECHAR = '#';
- public final static char POSTCHAR = '$';
+ public final static char POSTCHAR = '$';
- protected int numPreChars;
- protected int numPostChars;
+ protected int numPreChars;
+ protected int numPostChars;
- public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
+ public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
- @Override
- public int getNumPostChars() {
- return numPreChars;
- }
+ /** Returns the number of POSTCHAR padding characters, as stored by setNumPrePostChars (was returning numPreChars). */
+ @Override
+ public int getNumPostChars() {
+ return numPostChars;
+ }
- @Override
- public int getNumPreChars() {
- return numPostChars;
- }
+ /** Returns the number of PRECHAR padding characters, as stored by setNumPrePostChars (was returning numPostChars). */
+ @Override
+ public int getNumPreChars() {
+ return numPreChars;
+ }
- @Override
- public void serializeToken(DataOutput dos) throws IOException {
- handleTokenTypeTag(dos);
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
- // regular chars
- int numRegChars = tokenLength - numPreChars - numPostChars;
+ // regular chars
+ int numRegChars = tokenLength - numPreChars - numPostChars;
- // assuming pre and post char need 1-byte each in utf8
- int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars
- + numPostChars;
+ // assuming pre and post char need 1-byte each in utf8
+ int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars + numPostChars;
- // write utf8 length indicator
- StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+ // write utf8 length indicator
+ StringUtils.writeUTF8Len(tokenUTF8Len, dos);
- // pre chars
- for (int i = 0; i < numPreChars; i++) {
- StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
- }
+ // pre chars
+ for (int i = 0; i < numPreChars; i++) {
+ StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+ }
- int pos = start;
- for (int i = 0; i < numRegChars; i++) {
- char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
- StringUtils.writeCharAsModifiedUTF8(c, dos);
- pos += StringUtils.charSize(data, pos);
- }
+ int pos = start;
+ for (int i = 0; i < numRegChars; i++) {
+ char c = Character.toLowerCase(StringUtils.charAt(data, pos));
+ StringUtils.writeCharAsModifiedUTF8(c, dos);
+ pos += StringUtils.charSize(data, pos);
+ }
- // post chars
- for (int i = 0; i < numPostChars; i++) {
- StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
- }
- }
+ // post chars
+ for (int i = 0; i < numPostChars; i++) {
+ StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+ }
+ }
- public void setNumPrePostChars(int numPreChars, int numPostChars) {
- this.numPreChars = numPreChars;
- this.numPostChars = numPostChars;
- }
+ public void setNumPrePostChars(int numPreChars, int numPostChars) {
+ this.numPreChars = numPreChars;
+ this.numPostChars = numPostChars;
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
index 25e0cd3..2a74145 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
@@ -26,21 +26,21 @@
public class UTF8WordToken extends AbstractUTF8Token {
- public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
+ public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
- @Override
- public void serializeToken(DataOutput dos) throws IOException {
- handleTokenTypeTag(dos);
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
- int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
- StringUtils.writeUTF8Len(tokenUTF8Len, dos);
- int pos = start;
- for (int i = 0; i < tokenLength; i++) {
- char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
- StringUtils.writeCharAsModifiedUTF8(c, dos);
- pos += StringUtils.charSize(data, pos);
- }
- }
+ int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
+ StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+ int pos = start;
+ for (int i = 0; i < tokenLength; i++) {
+ char c = Character.toLowerCase(StringUtils.charAt(data, pos));
+ StringUtils.writeCharAsModifiedUTF8(c, dos);
+ pos += StringUtils.charSize(data, pos);
+ }
+ }
}