[NO ISSUE] Remove out-of-date tokenizer
The string-based Tokenizer should be replaced with the array-based
IBinaryTokenizer. The Tokenizer is not used in the codebase in a
meaningful way, so remove it, together with the code that depends on
it, to avoid confusion between the two tokenizer APIs.
Change-Id: I483604bf2a5e20c18f6224ac2a153667828dabfb
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5763
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Ian Maxon <imaxon@uci.edu>
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
deleted file mode 100644
index 8be6f0c..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.Collection;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
-import org.apache.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
-
-public class FuzzyJoinAppendLength {
- public static void main(String args[]) throws IOException {
- final String inputFileName = args[0];
- final String outputFileName = args[1];
-
- BufferedReader input = new BufferedReader(new FileReader(inputFileName));
- BufferedWriter output = new BufferedWriter(new FileWriter(outputFileName));
-
- Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE,
- FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
-
- int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3");
-
- String line;
- while ((line = input.readLine()) != null) {
- String[] splits = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
- Collection<String> tokens =
- tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR));
- output.write(splits[0] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[1] + FuzzyJoinConfig.RECORD_SEPARATOR
- + splits[2] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[3] + FuzzyJoinConfig.RECORD_SEPARATOR
- + tokens.size() + "\n");
- }
-
- input.close();
- output.close();
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java
deleted file mode 100644
index 4c85f25..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin;
-
-import java.io.BufferedOutputStream;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
-import org.apache.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
-import org.apache.asterix.fuzzyjoin.tokenorder.TokenLoad;
-import org.apache.asterix.fuzzyjoin.tokenorder.TokenRank;
-import org.apache.asterix.fuzzyjoin.tokenorder.TokenRankFrequency;
-
-public class FuzzyJoinTokenize {
- public static class TokenCount implements Comparable<Object> {
- public String token;
- public MutableInteger count;
-
- public TokenCount(String token, MutableInteger count) {
- this.token = token;
- this.count = count;
- }
-
- @Override
- public int compareTo(Object o) {
- TokenCount tc = (TokenCount) o;
- return count.compareTo(tc.count);
- }
-
- public String getToken() {
- return token;
- }
-
- @Override
- public String toString() {
- return token + " " + count;
- }
- }
-
- public static void main(String args[]) throws IOException {
- final String inputFileName = args[0];
- final String tokensFileName = args[1];
- final String tokenizedFileName = args[2];
-
- BufferedReader input = new BufferedReader(new FileReader(inputFileName));
-
- Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE,
- FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
-
- int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3");
-
- String line;
- HashMap<String, MutableInteger> tokenCount = new HashMap<String, MutableInteger>();
- while ((line = input.readLine()) != null) {
- Collection<String> tokens =
- tokenizer.tokenize(FuzzyJoinUtil.getData(line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX),
- dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR));
-
- for (String token : tokens) {
- MutableInteger count = tokenCount.get(token);
- if (count == null) {
- tokenCount.put(token, new MutableInteger(1));
- } else {
- count.inc();
- }
- }
- }
-
- input.close();
-
- ArrayList<TokenCount> tokenCounts = new ArrayList<TokenCount>();
- tokenCount.forEach((key, value) -> tokenCounts.add(new TokenCount(key, value)));
- Collections.sort(tokenCounts);
-
- BufferedWriter outputTokens = new BufferedWriter(new FileWriter(tokensFileName));
- for (TokenCount tc : tokenCounts) {
- outputTokens.write(tc.getToken() + "\n");
- }
- outputTokens.close();
-
- TokenRank tokenRank = new TokenRankFrequency();
- TokenLoad tokenLoad = new TokenLoad(tokensFileName, tokenRank);
- tokenLoad.loadTokenRank();
-
- input = new BufferedReader(new FileReader(inputFileName));
- LittleEndianIntOutputStream outputTokenized =
- new LittleEndianIntOutputStream(new BufferedOutputStream(new FileOutputStream(tokenizedFileName)));
- while ((line = input.readLine()) != null) {
- String splits[] = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
- int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
- outputTokenized.writeInt(rid);
- Collection<String> tokens =
- tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR));
- Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens);
- outputTokenized.writeInt(tokensRanked.size());
- for (Integer token : tokensRanked) {
- outputTokenized.writeInt(token);
- }
- // for (int i = 0; i < tokens.size() - tokensRanked.size(); i++) {
- // outputTokenized.writeInt(Integer.MAX_VALUE);
- // }
- }
-
- input.close();
- outputTokenized.close();
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index 3348d4c..1133246 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -19,7 +19,6 @@
package org.apache.asterix.fuzzyjoin.similarity;
-import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.data.std.util.ISequenceIterator;
@@ -118,7 +117,4 @@
public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
- public abstract float getSimilarity(int[] tokensX, int[] tokensY);
-
- public abstract float getSimilarity(String stringX, String stringY, Tokenizer tokenizer);
}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index 63d3077..f72400f 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -22,7 +22,6 @@
import java.util.Set;
import java.util.TreeSet;
-import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.data.std.util.ISequenceIterator;
@@ -82,24 +81,4 @@
return (float) intersectionSize / (totalSize - intersectionSize);
}
- @Override
- public float getSimilarity(int[] tokensX, int[] tokensY) {
- return getSimilarity(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length);
- }
-
- @Override
- public float getSimilarity(String stringX, String stringY, Tokenizer tokenizer) {
- Set<String> setX = new TreeSet<String>();
- for (String token : tokenizer.tokenize(stringX)) {
- setX.add(token);
- }
- Set<String> setY = new TreeSet<String>();
- for (String token : tokenizer.tokenize(stringY)) {
- setY.add(token);
- }
- int lengthX = setX.size();
- int lengthY = setY.size();
- setX.retainAll(setY);
- return ((float) setX.size()) / (lengthX + lengthY - setX.size());
- }
}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
deleted file mode 100644
index 5594e43..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-public class NGramTokenizer implements Tokenizer {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- public static void main(String args[]) {
- Tokenizer tokenizer = new NGramTokenizer();
- String a = "hadoopoop";
- System.out.println(a + ":" + tokenizer.tokenize(a));
- }
-
- private final int gramLength;
-
- /**
- * padding used in q gram calculation.
- */
- private final char QGRAMENDPADDING = '$';
-
- /**
- * padding used in q gram calculation.
- */
- private final char QGRAMSTARTPADDING = '$';
-
- public NGramTokenizer() {
- gramLength = 3;
- }
-
- public NGramTokenizer(int gramLength) {
- this.gramLength = gramLength;
- }
-
- private StringBuffer getAdjustedString(String input) {
- final StringBuffer adjustedString = new StringBuffer();
- for (int i = 0; i < gramLength - 1; i++) {
- adjustedString.append(QGRAMSTARTPADDING);
- }
- adjustedString.append(input);
- for (int i = 0; i < gramLength - 1; i++) {
- adjustedString.append(QGRAMENDPADDING);
- }
- return adjustedString;
- }
-
- public List<String> tokenize(String input) {
- final ArrayList<String> returnVect = new ArrayList<String>();
- final StringBuffer adjustedString = getAdjustedString(input);
- int curPos = 0;
- final int length = adjustedString.length() - (gramLength - 1);
- final HashMap<String, Integer> grams = new HashMap<String, Integer>();
- while (curPos < length) {
- final String term = adjustedString.substring(curPos, curPos + gramLength);
- Integer count = grams.get(term);
- if (count == null) {
- count = new Integer(0);
- }
- count++;
- grams.put(term, count);
- returnVect.add(term + count);
- curPos++;
- }
- return returnVect;
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java
deleted file mode 100644
index 720d269..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.Serializable;
-
-public class Token implements Serializable {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- private CharSequence data;
- private int start;
- private int length;
- private int count;
-
- /** Cache the hash code for the string */
- private int hash; // Default to 0
-
- public Token() {
- }
-
- public Token(CharSequence data, int start, int length, int count) {
- set(data, start, length, count);
- }
-
- @Override
- public boolean equals(Object o) {
- if (o == null) {
- return false;
- }
- if (!(o instanceof Token)) {
- return false;
- }
- Token t = (Token) o;
- if (t.length != length) {
- return false;
- }
- for (int i = 0; i < length; i++) {
- if (t.data.charAt(t.start + i) != data.charAt(start + i)) {
- return false;
- }
- }
- return true;
- }
-
- public CharSequence getCharSequence() {
- return data;
- }
-
- public int getCount() {
- return count;
- }
-
- public int getLength() {
- return length;
- }
-
- public int getStart() {
- return start;
- }
-
- @Override
- public int hashCode() {
- int h = hash;
- if (h == 0 && length > 0) {
- for (int i = 0; i < length; i++) {
- h = 31 * h + data.charAt(start + i);
- }
- h = 31 * h + count;
- hash = h;
- }
- return h;
- }
-
- public int length() {
- return length;
- }
-
- public void set(CharSequence data, int start, int length, int count) {
- this.data = data;
- this.start = start;
- this.length = length;
- this.count = count;
- hash = 0;
- }
-
- public void set(String data, int count) {
- this.data = data;
- start = 0;
- length = data.length();
- this.count = count;
- hash = 0;
- }
-
- @Override
- public String toString() {
- return "(" + data.subSequence(start, start + length) + ", " + count + ")";
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java
deleted file mode 100644
index 71078d5..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.Serializable;
-import java.util.List;
-
-public interface Tokenizer extends Serializable {
- public List<String> tokenize(String text);
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
deleted file mode 100644
index 19fcf18..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public interface TokenizerBuffered {
- public void advance();
-
- public boolean end();
-
- public Token getToken();
-
- public void reset();
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
deleted file mode 100644
index 2f4e8c6..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class TokenizerBufferedFactory {
- public static TokenizerBuffered getTokenizer(String tokenizer, StringBuilder buffer) {
- if (tokenizer.equals("Word")) {
- return new WordTokenizerBuffered(buffer);
- }
- throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\".");
- }
-
- public static boolean isSeparator(char c) {
- return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER
- || Character.getType(c) == Character.OTHER_NUMBER);
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
deleted file mode 100644
index 9b1856a..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class TokenizerFactory {
- public static Tokenizer getTokenizer(String tokenizer, String wordSeparator, char tokenSeparator) {
- if (tokenizer.equals("NGram")) {
- return new NGramTokenizer();
- } else if (tokenizer.equals("Word")) {
- return new WordTokenizer(wordSeparator, tokenSeparator);
- }
- throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\".");
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
deleted file mode 100644
index fa0bfe7..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-public class WordTokenizer implements Tokenizer {
-
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- public static void main(String args[]) {
- Tokenizer tokenizer = new WordTokenizer("_", '_');
- String a = "hadoop_rocks_in_java";
- System.out.println(a + ":" + tokenizer.tokenize(a));
- }
-
- private final String wordSeparator;
- private final char tokenSeparator;
-
- public WordTokenizer() {
- this(" ", '_');
- }
-
- public WordTokenizer(String wordSeparator, char tokenSeparator) {
- this.wordSeparator = wordSeparator;
- this.tokenSeparator = tokenSeparator;
- }
-
- public List<String> tokenize(String input) {
- final ArrayList<String> returnVect = new ArrayList<String>();
- final HashMap<String, Integer> tokens = new HashMap<String, Integer>();
- for (String term : input.split(wordSeparator)) {
- if (term.length() == 0) {
- continue;
- }
- Integer count = tokens.get(term);
- if (count == null) {
- count = 0;
- }
- count++;
- tokens.put(term, count);
- returnVect.add(term + tokenSeparator + count);
- }
- return returnVect;
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
deleted file mode 100644
index 29206f9..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import org.apache.asterix.fuzzyjoin.IntArray;
-
-public class WordTokenizerBuffered implements TokenizerBuffered {
-
- private final StringBuilder buffer;
- private int index;
- private final Token token;
-
- private final IntArray tokensStart, tokensLength;
-
- public WordTokenizerBuffered(StringBuilder buffer) {
- this.buffer = buffer;
- token = new Token();
- tokensStart = new IntArray();
- tokensLength = new IntArray();
- reset();
- }
-
- @Override
- public void advance() {
- while (index < buffer.length() && TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) {
- index++;
- }
- int start = index;
- while (index < buffer.length() && !TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) {
- buffer.setCharAt(index, Character.toLowerCase(buffer.charAt(index)));
- index++;
- }
- int length = index - start;
- int count = 1;
- if (length > 0) {
- // search if we got the same token before
- for (int i = 0; i < tokensStart.length(); ++i) {
- if (length == tokensLength.get(i)) {
- int tokenStart = tokensStart.get(i);
- count++; // assume we found it
- for (int j = 0; j < length; ++j) {
- if (buffer.charAt(start + j) != buffer.charAt(tokenStart + j)) {
- count--; // token not found
- break;
- }
- }
- }
- }
- // add the new token to the list of seen tokens
- tokensStart.add(start);
- tokensLength.add(length);
- }
- // set token
- token.set(buffer, start, length, count);
- }
-
- @Override
- public boolean end() {
- return token.length() <= 0;
- }
-
- @Override
- public Token getToken() {
- return token;
- }
-
- @Override
- public void reset() {
- index = 0;
- tokensStart.reset();
- tokensLength.reset();
- advance();
- }
-
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
deleted file mode 100644
index 90f8c6a..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.io.Serializable;
-
-public interface IntTokenCountRank extends Serializable {
- public int add(int token, int count);
-
- public int getRank(int token, int count);
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
deleted file mode 100644
index d54c7d6..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.util.HashMap;
-
-import org.apache.asterix.fuzzyjoin.IntPair;
-
-public class IntTokenCountRankFrequency implements IntTokenCountRank {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- private final HashMap<IntPair, Integer> ranksMap = new HashMap<IntPair, Integer>();
- private final IntPair tmpPair = new IntPair();
- private int crtRank = 0;
-
- @Override
- public int add(int token, int count) {
- int prevRank = crtRank;
- ranksMap.put(new IntPair(token, count), prevRank);
- crtRank++;
- return prevRank;
- }
-
- @Override
- public int getRank(int token, int count) {
- tmpPair.set(token, count);
- Integer rank = ranksMap.get(tmpPair);
- if (rank == null) {
- return -1;
- }
- return rank;
- }
-
- @Override
- public String toString() {
- return "[" + crtRank + ",\n " + ranksMap + "\n]";
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
deleted file mode 100644
index b8e2082..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.io.Serializable;
-
-public interface IntTokenRank extends Serializable {
- public int add(int token);
-
- public int getRank(int token);
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
deleted file mode 100644
index 08d1c93..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.util.HashMap;
-
-public class IntTokenRankFrequency implements IntTokenRank {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- private final HashMap<Integer, Integer> ranksMap = new HashMap<Integer, Integer>();
- private int crtRank = 0;
-
- @Override
- public int add(int token) {
- int prevRank = crtRank;
- ranksMap.put(token, prevRank);
- crtRank++;
- return prevRank;
- }
-
- @Override
- public int getRank(int token) {
- Integer rank = ranksMap.get(token);
- if (rank == null) {
- return -1;
- }
- return rank;
- }
-
- @Override
- public String toString() {
- return "[" + crtRank + ",\n " + ranksMap + "\n]";
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java
deleted file mode 100644
index 3578d94..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Serializable;
-
-import org.apache.asterix.fuzzyjoin.FuzzyJoinConfig;
-
-public class TokenLoad implements Serializable {
- private final String path;
- private final TokenRank rank;
-
- public TokenLoad(String path, TokenRank rank) {
- this.path = path;
- this.rank = rank;
- }
-
- public void loadTokenRank() {
- loadTokenRank(1);
- }
-
- public void loadTokenRank(int factor) {
- try (BufferedReader fis = new BufferedReader(
- // new FileReader(path.toString())
- new InputStreamReader(new FileInputStream(path), "UTF-8"))) {
- String token = null;
- while ((token = fis.readLine()) != null) {
- rank.add(token);
- // only used when increasing the token dictionary
- for (int i = 1; i < factor; i++) {
- // remove _COUNT at the end of the token (it is removed in
- // the new records anyway)
- rank.add(token.split(FuzzyJoinConfig.TOKEN_SEPARATOR_REGEX)[0] + i);
- }
- }
- } catch (IOException ioe) {
- throw new RuntimeException(ioe);
- }
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java
deleted file mode 100644
index 42cdfa7..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.io.Serializable;
-import java.util.Collection;
-
-public interface TokenRank extends Serializable {
- public int add(String token);
-
- public Integer getRank(String token);
-
- public Collection<Integer> getTokenRanks(Iterable<String> tokens);
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
deleted file mode 100644
index 57fc325..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.util.Collection;
-import java.util.HashMap;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.Token;
-
-public class TokenRankBufferedFrequency implements TokenRank {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- private final HashMap<Token, Integer> ranksMap = new HashMap<Token, Integer>();
- private int crtRank = 0;
-
- public int add(String stringWithCount) {
- int end = stringWithCount.lastIndexOf('_');
- int count = 0;
- for (int i = end + 1; i < stringWithCount.length(); ++i) {
- count = count * 10 + (stringWithCount.charAt(i) - '0');
- }
- return add(stringWithCount.substring(0, end), count);
- }
-
- public int add(String string, int count) {
- Token token = new Token(string, 0, string.length(), count);
- return add(token);
- }
-
- public int add(Token token) {
- int prevRank = crtRank;
- ranksMap.put(token, prevRank);
- crtRank++;
- return prevRank;
- }
-
- @Override
- public Integer getRank(String token) {
- throw new UnsupportedOperationException();
- }
-
- public Integer getRank(Token token) {
- return ranksMap.get(token);
- }
-
- @Override
- public Collection<Integer> getTokenRanks(Iterable<String> tokens) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public String toString() {
- return "[" + crtRank + ",\n " + ranksMap + "\n]";
- }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java
deleted file mode 100644
index 97b95036..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.TreeSet;
-
-public class TokenRankFrequency implements TokenRank {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
-
- private final HashMap<String, Integer> ranksMap = new HashMap<String, Integer>();
- private int crtRank = 0;
-
- public int add(String token) {
- int prevRank = crtRank;
- ranksMap.put(token, prevRank);
- crtRank++;
- return prevRank;
- }
-
- public Integer getRank(String token) {
- return ranksMap.get(token);
- }
-
- public Collection<Integer> getTokenRanks(Iterable<String> tokens) {
- TreeSet<Integer> ranksCol = new TreeSet<Integer>();
- for (String token : tokens) {
- Integer rank = getRank(token);
- if (rank != null) {
- ranksCol.add(rank);
- }
- }
- return ranksCol;
- }
-
- @Override
- public String toString() {
- return "[" + crtRank + ",\n " + ranksMap + "\n]";
- }
-}