adding fuzzyjoin code to git
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyFiltersJaccard.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyFiltersJaccard.java
new file mode 100644
index 0000000..75029a8
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyFiltersJaccard.java
@@ -0,0 +1,104 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class FuzzyFiltersJaccard {
+
+ /**
+ * type is double because with float .8 / (1 + .8) * (8 + 10) = 8.0...01
+ */
+ protected final double simThr;
+ protected final double simThrExpr;
+
+ public FuzzyFiltersJaccard(double similarityThreshold) {
+ simThr = similarityThreshold;
+ simThrExpr = simThr / (1 + simThr);
+ }
+
+ public int getIndexPrefixLength(int length) {
+ return length - (int) Math.ceil(2 * simThrExpr * length) + 1;
+ }
+
+ public int getIntersectLowerBound(int lengthX, int lengthY) {
+ return (int) Math.ceil(simThrExpr * (lengthX + lengthY));
+ }
+
+ public long getIntersectLowerBound(long lengthX, long lengthY) {
+ return (long) Math.ceil(simThrExpr * (lengthX + lengthY));
+ }
+
+ public int getIntersectUpperBound(int noGramsCommon, int positionX, int positionY, int lengthX, int lengthY) {
+ return noGramsCommon + Math.min(lengthX - positionX - 1, lengthY - positionY - 1);
+ }
+
+ public long getIntersectUpperBound(int noGramsCommon, long positionX, long positionY, long lengthX, long lengthY) {
+ return noGramsCommon + Math.min(lengthX - positionX - 1, lengthY - positionY - 1);
+ }
+
+ public int getLengthLowerBound(int length) {
+ return (int) Math.ceil(simThr * length);
+ }
+
+ public long getLengthLowerBound(long length) {
+ return (long) Math.ceil(simThr * length);
+ }
+
+ public int getPrefixLength(int length) {
+ return length - (int) Math.ceil(simThr * length) + 1;
+ }
+
+ public long getPrefixLength(long length) {
+ return length - (long) Math.ceil(simThr * length) + 1;
+ }
+
+ public double getSimilarityThreshold() {
+ return simThr;
+ }
+
+ public boolean passLengthFilter(int lengthX, int lengthY) {
+ return getLengthLowerBound(lengthX) <= lengthY && lengthY <= 1 / simThr * lengthX;
+ }
+
+ public boolean passLengthFilter(long lengthX, long lengthY) {
+ return getLengthLowerBound(lengthX) <= lengthY && lengthY <= 1 / simThr * lengthX;
+ }
+
+ /**
+ * @param noGramsCommon
+ * number of grams in common
+ * @param positionX
+ * position of the last gram in common on X
+ * @param positionY
+ * position of the last gram in common on X
+ * @param lengthX
+ * @param lengthY
+ * @return
+ */
+ public boolean passPositionFilter(int noGramsCommon, int positionX, int positionY, int lengthX, int lengthY) {
+ return getIntersectUpperBound(noGramsCommon, positionX, positionY, lengthX, lengthY) >= getIntersectLowerBound(
+ lengthX, lengthY);
+ }
+
+ public boolean passPositionFilter(int noGramsCommon, long positionX, long positionY, long lengthX, long lengthY) {
+ return getIntersectUpperBound(noGramsCommon, positionX, positionY, lengthX, lengthY) >= getIntersectLowerBound(
+ lengthX, lengthY);
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinAppendLength.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
new file mode 100644
index 0000000..b08aa29
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Tokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
+
+public class FuzzyJoinAppendLength {
+ public static void main(String args[]) throws IOException {
+ final String inputFileName = args[0];
+ final String outputFileName = args[1];
+
+ BufferedReader input = new BufferedReader(new FileReader(inputFileName));
+ BufferedWriter output = new BufferedWriter(new FileWriter(outputFileName));
+
+ Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE,
+ FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
+
+ int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3");
+
+ String line;
+ HashMap<String, MutableInteger> tokenCount = new HashMap<String, MutableInteger>();
+ while ((line = input.readLine()) != null) {
+ String[] splits = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
+ Collection<String> tokens = tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns,
+ FuzzyJoinConfig.TOKEN_SEPARATOR));
+ output.write(splits[0] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[1] + FuzzyJoinConfig.RECORD_SEPARATOR
+ + splits[2] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[3] + FuzzyJoinConfig.RECORD_SEPARATOR
+ + tokens.size() + "\n");
+ }
+
+ input.close();
+ output.close();
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinConfig.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinConfig.java
new file mode 100644
index 0000000..03806a6
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinConfig.java
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class FuzzyJoinConfig {
+ private static final String NAMESPACE = "fuzzyjoin";
+ //
+ // tokenizer
+ //
+ public static final String TOKENIZER_PROPERTY = NAMESPACE + ".tokenizer";
+ public static final String TOKENIZER_VALUE = "Word";
+ //
+ // similarity
+ //
+ public static final String SIMILARITY_NAME_PROPERTY = NAMESPACE + ".similarity.name";
+ public static final String SIMILARITY_NAME_VALUE = "Jaccard";
+ public static final String SIMILARITY_THRESHOLD_PROPERTY = NAMESPACE + ".similarity.threshold";
+ public static final float SIMILARITY_THRESHOLD_VALUE = .8f;
+ //
+ // record
+ //
+ public static final String RECORD_DATA_PROPERTY = NAMESPACE + ".record.data";
+ public static final String RECORD_DATA_VALUE = "1";
+ public static final String RECORD_DATA1_PROPERTY = NAMESPACE + ".record.data1";
+ public static final String RECORD_DATA1_VALUE = "1";
+ //
+ // data
+ //
+ public static final String DATA_TOKENS_PROPERTY = NAMESPACE + ".data.tokens";
+ //
+ // const
+ //
+ public static final String RECORD_DATA_VALUE_SEPARATOR_REGEX = ",";
+ public static final char WORD_SEPARATOR = '_';
+ public static final String WORD_SEPARATOR_REGEX = "_";
+ public static final char TOKEN_SEPARATOR = '_';
+ public static final String TOKEN_SEPARATOR_REGEX = "_";
+ public static final int RECORD_KEY = 0;
+ //
+ // separators
+ //
+ public static final char TOKEN_RANK_SEPARATOR = '_';
+ public static final char RECORD_SEPARATOR = ':';
+ public static final String RECORD_SEPARATOR_REGEX = ":";
+ public static final char RECORD_EXTRA_SEPARATOR = ';';
+ public static final String RECORD_EXTRA_SEPARATOR_REGEX = ";";
+ public static final char RIDPAIRS_SEPARATOR = ' ';
+ public static final String RIDPAIRS_SEPARATOR_REGEX = " ";
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinContext.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinContext.java
new file mode 100644
index 0000000..738bdc8
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinContext.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.util.ArrayList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard;
+
+public class FuzzyJoinContext {
+ public final float similarityThreshold;
+ public final SimilarityFiltersJaccard similarityFilters;
+ public final ArrayList<int[]> records;
+ public final ArrayList<ResultSelfJoin> results;
+
+ public FuzzyJoinContext(float similarityThreshold) {
+ this.similarityThreshold = similarityThreshold;
+ similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
+ records = new ArrayList<int[]>();
+ results = new ArrayList<ResultSelfJoin>();
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinMemory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinMemory.java
new file mode 100644
index 0000000..ec2a411
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinMemory.java
@@ -0,0 +1,317 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import edu.uci.ics.asterix.fuzzyjoin.invertedlist.InvertedListLengthList;
+import edu.uci.ics.asterix.fuzzyjoin.invertedlist.InvertedListsLengthList;
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard;
+
+public class FuzzyJoinMemory {
+ public static void main(String[] args) {
+ if (args.length < 2) {
+ System.err.println("Usage: <threshold> <file> [no runs, e.g., 1] [warm-up factor, e.g., 1]");
+ System.exit(2);
+ }
+
+ float similarityThreshold = Float.valueOf(args[0]);
+ String fileName = args[1];
+
+ int noRuns = 1, warmUpFactor = 1;
+ if (args.length > 2) {
+ noRuns = Integer.valueOf(args[2]);
+ if (args.length > 3) {
+ warmUpFactor = Integer.valueOf(args[3]);
+ }
+ }
+
+ System.err.println("Document: " + fileName);
+ System.err.println("... LOADING DATASET ...");
+
+ ArrayList<int[]> records = new ArrayList<int[]>();
+ ArrayList<Integer> rids = new ArrayList<Integer>();
+
+ FuzzyJoinMemory fj = new FuzzyJoinMemory(similarityThreshold);
+
+ FuzzyJoinMemory.readRecords(fileName, records, rids);
+
+ System.err.println("Algorithm: ppjoin");
+ System.err.println("Threshold: Jaccard " + similarityThreshold);
+
+ List<ResultSelfJoin> results = fj.runs(records, noRuns, warmUpFactor);
+
+ for (ResultSelfJoin result : results) {
+ System.out.format("%d %d %.3f", rids.get(result.indexX), rids.get(result.indexY), result.similarity);
+ System.out.println();
+ // System.out.format("(" + result.indexX + "," + result.indexY +
+ // ")");
+ // System.out.println();
+ // System.out.format("(" + rids.get(result.indexX) + ","
+ // + rids.get(result.indexY) + ")\t" + result.similarity);
+ // System.out.println();
+ }
+ }
+
+ public static void readRecords(String fileName, List<int[]> records, List<Integer> rids) {
+ LittleEndianIntInputStream in;
+ try {
+ in = new LittleEndianIntInputStream(new BufferedInputStream(new FileInputStream(fileName)));
+ } catch (FileNotFoundException e) {
+ throw new RuntimeException(e);
+ }
+
+ while (true) {
+ int rid = 0;
+ try {
+ rid = in.readInt();
+ } catch (IOException e) {
+ // FILE_EXPECTED reach of EOF
+ break;
+ }
+
+ rids.add(rid);
+ int[] record;
+
+ try {
+ int size = in.readInt();
+ record = new int[size];
+ for (int j = 0; j < size; j++) {
+ int token = in.readInt();
+ record[j] = token;
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ records.add(record);
+ }
+ }
+
+ private final InvertedListsLengthList invertedLists;
+ private final SimilarityFiltersJaccard similarityFilters;
+
+ private final ArrayList<int[]> records;
+
+ public FuzzyJoinMemory(float similarityThreshold) {
+ invertedLists = new InvertedListsLengthList();
+ similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
+ records = new ArrayList<int[]>();
+ }
+
+ public void add(final int[] tokens) {
+ final int index = records.size();
+ final int length = tokens.length;
+ final int indexPrefixLength = similarityFilters.getPrefixLength(length);
+
+ for (int indexToken = 0; indexToken < indexPrefixLength; indexToken++) {
+ invertedLists.index(tokens[indexToken], new int[] { index, indexToken, length });
+ }
+ records.add(tokens);
+ }
+
+ public ArrayList<ResultJoin> join(final int[] tokens, final int length) {
+ final int prefixLength = similarityFilters.getPrefixLength(length);
+ final int lengthLowerBound = similarityFilters.getLengthLowerBound(length);
+ //
+ // self join
+ //
+ final HashMap<Integer, Integer> counts = new HashMap<Integer, Integer>();
+ for (int indexToken = 0; indexToken < Math.min(prefixLength, tokens.length); indexToken++) {
+ final int token = tokens[indexToken];
+ //
+ // probe index
+ //
+ InvertedListLengthList invertedList = invertedLists.get(token);
+ if (invertedList != null) {
+ // length filter
+ invertedList.setMinLength(lengthLowerBound);
+ for (int[] element : invertedList) {
+ final int indexProbe = element[0];
+ final int indexTokenProbe = element[1];
+ final int lengthProbe = element[2];
+ Integer count = counts.get(indexProbe);
+ if (count == null) {
+ count = 0;
+ }
+
+ if (count != -1) {
+ count++;
+ // position filter
+ if (!similarityFilters.passPositionFilter(count, indexToken, length, indexTokenProbe,
+ lengthProbe)) {
+ count = -1;
+ }
+ // suffix filter
+ if (count == 1
+ && !similarityFilters.passSuffixFilter(tokens, indexToken, records.get(indexProbe),
+ indexTokenProbe)) {
+ count = -1;
+ }
+ counts.put(indexProbe, count);
+ }
+ }
+ }
+ }
+ //
+ // verify candidates
+ //
+ ArrayList<ResultJoin> results = new ArrayList<ResultJoin>();
+ for (Map.Entry<Integer, Integer> cand : counts.entrySet()) {
+ int count = cand.getValue();
+ int indexProbe = cand.getKey();
+ if (count > 0) {
+ int tokensProbe[] = records.get(indexProbe);
+ float similarity = similarityFilters.passSimilarityFilter(tokens, prefixLength, tokensProbe,
+ similarityFilters.getPrefixLength(tokensProbe.length), count);
+ if (similarity > 0) {
+ results.add(new ResultJoin(indexProbe, similarity));
+ }
+ }
+ }
+ return results;
+ }
+
+ public void prune(int length) {
+ final int lengthLowerBound = similarityFilters.getLengthLowerBound(length + 1);
+ invertedLists.prune(lengthLowerBound);
+ }
+
+ public List<ResultSelfJoin> runs(Collection<int[]> records, int noRuns, int warmupFactor) {
+ if (records.size() < 2) {
+ return new ArrayList<ResultSelfJoin>();
+ }
+
+ int noRunsTotal = noRuns * warmupFactor;
+ float runtime = 0, runtimeAverage = 0;
+ ArrayList<ResultSelfJoin> results = new ArrayList<ResultSelfJoin>();
+
+ System.err.println("# Records: " + records.size());
+ System.err.print("=== BEGIN JOIN (TIMER STARTED) === ");
+ for (int i = 1; i <= noRunsTotal; i++) {
+ System.err.print(".");
+ System.err.flush();
+
+ results.clear();
+ Runtime.getRuntime().gc();
+
+ Date startTime = new Date();
+ for (int[] record : records) {
+ results.addAll(selfJoinAndAddRecord(record));
+ }
+ Date endTime = new Date();
+ runtime = (endTime.getTime() - startTime.getTime()) / (float) 1000.0;
+
+ if (i >= noRunsTotal - noRuns) {
+ runtimeAverage += runtime;
+ }
+ }
+ System.err.println();
+ System.err.println("# Results: " + results.size());
+ System.err.println("=== END JOIN (TIMER STOPPED) ===");
+ System.err.println("Total Running Time: " + runtimeAverage / noRuns + " (" + runtime + ")");
+ System.err.println();
+ return results;
+ }
+
+ public ArrayList<ResultSelfJoin> selfJoinAndAddRecord(final int[] tokens) {
+ final int index = records.size();
+ final int length = tokens.length;
+ final int prefixLength = similarityFilters.getPrefixLength(length);
+ final int indexPrefixLength = similarityFilters.getIndexPrefixLength(length);
+ final int lengthLowerBound = similarityFilters.getLengthLowerBound(length);
+ //
+ // self join
+ //
+ final HashMap<Integer, Integer> counts = new HashMap<Integer, Integer>();
+ for (int indexToken = 0; indexToken < prefixLength; indexToken++) {
+ final int token = tokens[indexToken];
+ //
+ // probe index
+ //
+ InvertedListLengthList invertedList = invertedLists.get(token);
+ if (invertedList != null) {
+ // length filter
+ invertedList.setMinLength(lengthLowerBound);
+ for (int[] element : invertedList) {
+ final int indexProbe = element[0];
+ final int indexTokenProbe = element[1];
+ final int lengthProbe = element[2];
+ Integer count = counts.get(indexProbe);
+ if (count == null) {
+ count = 0;
+ }
+
+ if (count != -1) {
+ count++;
+ // position filter
+ if (!similarityFilters.passPositionFilter(count, indexToken, length, indexTokenProbe,
+ lengthProbe)) {
+ count = -1;
+ }
+ // suffix filter
+ if (count == 1
+ && !similarityFilters.passSuffixFilter(tokens, indexToken, records.get(indexProbe),
+ indexTokenProbe)) {
+ count = -1;
+ }
+ counts.put(indexProbe, count);
+ }
+ }
+ }
+ //
+ // add to index
+ //
+ if (indexToken < indexPrefixLength) {
+ invertedLists.index(token, new int[] { index, indexToken, length });
+ }
+ }
+ //
+ // add record
+ //
+ records.add(tokens);
+ //
+ // verify candidates
+ //
+ ArrayList<ResultSelfJoin> results = new ArrayList<ResultSelfJoin>();
+ for (Map.Entry<Integer, Integer> cand : counts.entrySet()) {
+ int count = cand.getValue();
+ int indexProbe = cand.getKey();
+ if (count > 0) {
+ int tokensProbe[] = records.get(indexProbe);
+ float similarity = similarityFilters.passSimilarityFilter(tokens, prefixLength, tokensProbe,
+ similarityFilters.getIndexPrefixLength(tokensProbe.length), count);
+ if (similarity > 0) {
+ results.add(new ResultSelfJoin(index, indexProbe, similarity));
+ }
+ }
+ }
+ return results;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinTokenize.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinTokenize.java
new file mode 100644
index 0000000..aa8927d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinTokenize.java
@@ -0,0 +1,135 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Tokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenorder.TokenLoad;
+import edu.uci.ics.asterix.fuzzyjoin.tokenorder.TokenRank;
+import edu.uci.ics.asterix.fuzzyjoin.tokenorder.TokenRankFrequency;
+
+public class FuzzyJoinTokenize {
+ public static class TokenCount implements Comparable {
+ public String token;
+ public MutableInteger count;
+
+ public TokenCount(String token, MutableInteger count) {
+ this.token = token;
+ this.count = count;
+ }
+
+ @Override
+ public int compareTo(Object o) {
+ TokenCount tc = (TokenCount) o;
+ return count.compareTo(tc.count);
+ }
+
+ public String getToken() {
+ return token;
+ }
+
+ @Override
+ public String toString() {
+ return token + " " + count;
+ }
+ }
+
+ public static void main(String args[]) throws IOException {
+ final String inputFileName = args[0];
+ final String tokensFileName = args[1];
+ final String tokenizedFileName = args[2];
+
+ BufferedReader input = new BufferedReader(new FileReader(inputFileName));
+
+ Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE,
+ FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
+
+ int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3");
+
+ String line;
+ HashMap<String, MutableInteger> tokenCount = new HashMap<String, MutableInteger>();
+ while ((line = input.readLine()) != null) {
+ Collection<String> tokens = tokenizer.tokenize(FuzzyJoinUtil.getData(
+ line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX), dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR));
+
+ for (String token : tokens) {
+ MutableInteger count = tokenCount.get(token);
+ if (count == null) {
+ tokenCount.put(token, new MutableInteger(1));
+ } else {
+ count.inc();
+ }
+ }
+ }
+
+ input.close();
+
+ ArrayList<TokenCount> tokenCounts = new ArrayList<TokenCount>();
+ for (Map.Entry<String, MutableInteger> entry : tokenCount.entrySet()) {
+ tokenCounts.add(new TokenCount(entry.getKey(), entry.getValue()));
+ }
+ Collections.sort(tokenCounts);
+
+ BufferedWriter outputTokens = new BufferedWriter(new FileWriter(tokensFileName));
+ for (TokenCount tc : tokenCounts) {
+ outputTokens.write(tc.getToken() + "\n");
+ }
+ outputTokens.close();
+
+ TokenRank tokenRank = new TokenRankFrequency();
+ TokenLoad tokenLoad = new TokenLoad(tokensFileName, tokenRank);
+ tokenLoad.loadTokenRank();
+
+ input = new BufferedReader(new FileReader(inputFileName));
+ LittleEndianIntOutputStream outputTokenized = new LittleEndianIntOutputStream(new BufferedOutputStream(
+ new FileOutputStream(tokenizedFileName)));
+ while ((line = input.readLine()) != null) {
+ String splits[] = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
+ int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
+ outputTokenized.writeInt(rid);
+ Collection<String> tokens = tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns,
+ FuzzyJoinConfig.TOKEN_SEPARATOR));
+ Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens);
+ outputTokenized.writeInt(tokensRanked.size());
+ for (Integer token : tokensRanked) {
+ outputTokenized.writeInt(token);
+ }
+ // for (int i = 0; i < tokens.size() - tokensRanked.size(); i++) {
+ // outputTokenized.writeInt(Integer.MAX_VALUE);
+ // }
+ }
+
+ input.close();
+ outputTokenized.close();
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinUtil.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinUtil.java
new file mode 100644
index 0000000..d947e24
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinUtil.java
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.util.regex.Pattern;
+
+public class FuzzyJoinUtil {
+ private static final Pattern rePunctuation = Pattern.compile("[^\\p{L}\\p{N}]"); // L:Letter, N:Number
+ private static final Pattern reSpaceOrAnderscore = Pattern.compile("(_|\\s)+");
+
+ public static String clean(String in) {
+ /*
+ * - remove punctuation
+ *
+ * - normalize case
+ *
+ * - remove extra spaces
+ *
+ * - repalce space with FuzzyJoinDriver.TOKEN_SEPARATOR
+ */
+
+ in = rePunctuation.matcher(in).replaceAll(" ");
+ in = reSpaceOrAnderscore.matcher(in).replaceAll(" ");
+ in = in.trim();
+ in = in.replace(' ', '_');
+ in = in.toLowerCase();
+ return in;
+ }
+
+ /**
+ * @param splits
+ * splitted record
+ * @param dataColumns
+ * column index of data columns
+ * @param tokenSeparator
+ * TODO
+ * @return concatenation of data column values
+ */
+ public static String getData(Object[] splits, int[] dataColumns, char tokenSeparator) {
+ String data = null;
+ for (int dataColumn : dataColumns) {
+ if (data != null) {
+ data += tokenSeparator;
+ }
+ if (splits.length > dataColumn) {
+ if (data == null) {
+ data = "";
+ }
+ // data += splits[dataColumns[i]];
+ data += clean(splits[dataColumn].toString());
+ }
+ }
+ return data;
+ }
+
+ /**
+ * @param columnsString
+ * string containing the indexes of the columns containing data
+ * @return array of data columns indexes
+ */
+ public static int[] getDataColumns(String columnsString) {
+ String[] splits = columnsString.split(FuzzyJoinConfig.RECORD_DATA_VALUE_SEPARATOR_REGEX);
+ int[] columns = new int[splits.length];
+ for (int i = 0; i < splits.length; i++) {
+ columns[i] = Integer.parseInt(splits[i]);
+ }
+ return columns;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntArray.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntArray.java
new file mode 100644
index 0000000..7170e87
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntArray.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.util.Arrays;
+
+public class IntArray {
+ private static final int SIZE = 128;
+
+ private int[] data;
+ private int length;
+
+ public IntArray() {
+ data = new int[SIZE];
+ length = 0;
+ }
+
+ public void add(int d) {
+ if (length == data.length) {
+ data = Arrays.copyOf(data, data.length << 1);
+ }
+ data[length++] = d;
+ }
+
+ public int[] get() {
+ return data;
+ }
+
+ public int get(int i) {
+ return data[i];
+ }
+
+ public int length() {
+ return length;
+ }
+
+ public void reset() {
+ length = 0;
+ }
+
+ public void sort() {
+ sort(0, length);
+ }
+
+ public void sort(int start, int end) {
+ Arrays.sort(data, start, end);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder out = new StringBuilder();
+ out.append('[');
+ for (int i = 0; i < length; ++i) {
+ out.append(data[i]);
+ if (i < length - 1) {
+ out.append(',');
+ out.append(' ');
+ }
+ }
+ out.append(']');
+ return out.toString();
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntPair.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntPair.java
new file mode 100644
index 0000000..1d54e82
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntPair.java
@@ -0,0 +1,96 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class IntPair {
+
+ public static int[] PRIME = new int[] { 17, 31 }; // used for hashCode
+ // computations
+
+ protected int first;
+
+ protected int second;
+
+ public IntPair() {
+ }
+
+ public IntPair(int first, int second) {
+ this.first = first;
+ this.second = second;
+ }
+
+ public int compareTo(Object o) {
+ if (this == o) {
+ return 0;
+ }
+ IntPair p = (IntPair) o;
+ if (first != p.first) {
+ return first < p.first ? -1 : 1;
+ }
+ return second < p.second ? -1 : second > p.second ? 1 : 0;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof IntPair)) {
+ return false;
+ }
+ IntPair p = (IntPair) o;
+ return first == p.first && second == p.second;
+ }
+
+ public int getFirst() {
+ return first;
+ }
+
+ public int getSecond() {
+ return second;
+ }
+
+ @Override
+ public int hashCode() {
+ return first * PRIME[0] + second;
+ }
+
+ public void set(int first, int second) {
+ this.first = first;
+ this.second = second;
+ }
+
+ public void setFirst(int first) {
+ this.first = first;
+ }
+
+ public void setSecond(int second) {
+ this.second = second;
+ }
+
+ @Override
+ public String toString() {
+ return "(" + first + "," + second + ")";
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntInputStream.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntInputStream.java
new file mode 100644
index 0000000..e17b694
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntInputStream.java
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.EOFException;
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+public class LittleEndianIntInputStream extends FilterInputStream {
+
+ public LittleEndianIntInputStream(InputStream in) {
+ super(in);
+ }
+
+ public int readInt() throws IOException {
+ int a = read();
+ if (a == -1) {
+ throw new EOFException();
+ }
+ int b = read();
+ if (b == -1) {
+ throw new EOFException();
+ }
+ int c = read();
+ if (c == -1) {
+ throw new EOFException();
+ }
+ int d = read();
+ if (d == -1) {
+ throw new EOFException();
+ }
+ return (a | (b << 8) | (c << 16) | (d << 24)); // little endian
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntOutputStream.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntOutputStream.java
new file mode 100644
index 0000000..8b45338
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntOutputStream.java
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.FilterOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+public class LittleEndianIntOutputStream extends FilterOutputStream {
+ public LittleEndianIntOutputStream(OutputStream in) {
+ super(in);
+ }
+
+ public void writeInt(int v) throws IOException {
+ write((byte) (0xff & v));
+ write((byte) (0xff & (v >> 8)));
+ write((byte) (0xff & (v >> 16)));
+ write((byte) (0xff & (v >> 24)));
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/MutableInteger.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/MutableInteger.java
new file mode 100644
index 0000000..8dd8748
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/MutableInteger.java
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class MutableInteger implements Comparable {
+ private int v;
+
+ public MutableInteger(int v) {
+ this.v = v;
+ }
+
+ @Override
+ public int compareTo(Object o) {
+ MutableInteger m = (MutableInteger) o;
+ return v - m.v;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+ if (o == this) {
+ return true;
+ }
+ if (!(o instanceof MutableInteger)) {
+ return false;
+ }
+ MutableInteger m = (MutableInteger) o;
+ if (m.v == v) {
+ return true;
+ }
+ return false;
+ }
+
+ public int get() {
+ return v;
+ }
+
+ public void inc() {
+ v += 1;
+ }
+
+ public void set(int v) {
+ this.v = v;
+ }
+
+ @Override
+ public String toString() {
+ return "" + v;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/RIDPairSimilarity.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/RIDPairSimilarity.java
new file mode 100644
index 0000000..64a3ea1
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/RIDPairSimilarity.java
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class RIDPairSimilarity {
+ public int rid1, rid2;
+ public float similarity;
+
+ public RIDPairSimilarity() {
+ }
+
+ public RIDPairSimilarity(int rid1, int rid2, float similarity) {
+ set(rid1, rid2, similarity);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ RIDPairSimilarity r = (RIDPairSimilarity) o;
+ return rid1 == r.rid1 && rid2 == r.rid2;
+ }
+
+ @Override
+ public int hashCode() {
+ return rid1 * rid2 * (rid1 - rid2);
+ }
+
+ public void set(int rid1, int rid2, float similarity) {
+ this.rid1 = rid1;
+ this.rid2 = rid2;
+ this.similarity = similarity;
+ }
+
+ @Override
+ public String toString() {
+ return "{(" + rid1 + ", " + rid2 + "), " + similarity + "}";
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultJoin.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultJoin.java
new file mode 100644
index 0000000..f4b6c34
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultJoin.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class ResultJoin {
+ public int index;
+ public float similarity;
+
+ public ResultJoin(int index, float similarity) {
+ this.index = index;
+ this.similarity = similarity;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultSelfJoin.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultSelfJoin.java
new file mode 100644
index 0000000..9e2658c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultSelfJoin.java
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class ResultSelfJoin {
+ public int indexX, indexY;
+ public float similarity;
+
+ public ResultSelfJoin(int indexX, int indexY, float similarity) {
+ this.indexX = indexX;
+ this.indexY = indexY;
+ this.similarity = similarity;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedList.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedList.java
new file mode 100644
index 0000000..fe0553c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedList.java
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+
+public interface InvertedList extends Iterable<int[]> {
+ public void add(int[] element);
+
+ public void setMinLength(int length);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthFixed.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthFixed.java
new file mode 100644
index 0000000..ed8db19
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthFixed.java
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.Iterator;
+
+public class InvertedListLengthFixed implements InvertedList {
+ public class ListIterator implements Iterator<int[]> {
+
+ int ix;
+
+ public ListIterator(int ix) {
+ this.ix = ix;
+ }
+
+ public boolean hasNext() {
+ return ix < sz;
+ }
+
+ public int[] next() {
+ return list[ix++];
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ private final int[][] list;
+
+ private int sz, ix;
+
+ public InvertedListLengthFixed(int size) {
+ list = new int[size * 3][3];
+ sz = 0;
+ ix = 0;
+ }
+
+ public void add(int[] element) {
+ list[sz++] = element;
+ }
+
+ public int getIndex() {
+ return ix;
+ }
+
+ public int getSize() {
+ return sz;
+ }
+
+ public Iterator<int[]> iterator() {
+ // return Arrays.asList(list).iterator();
+ return new ListIterator(ix);
+ }
+
+ public void setMinLength(int minLength) {
+ while (ix < sz && list[ix][2] < minLength) {
+ ix++;
+ }
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthList.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthList.java
new file mode 100644
index 0000000..6322040
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthList.java
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+public class InvertedListLengthList implements InvertedList {
+ /**
+ * @author rares
+ * assumes that ListLength(s) are not empty inside the
+ * InvertedListLength
+ */
+ private class ListIterator implements Iterator<int[]> {
+
+ private final Iterator<ListLength> iteratorLength;
+ private Iterator<int[]> iteratorList;
+
+ public ListIterator() {
+ iteratorLength = list.iterator();
+ iteratorList = null;
+ }
+
+ public boolean hasNext() {
+ return (iteratorList != null && iteratorList.hasNext()) || iteratorLength.hasNext();
+ }
+
+ public int[] next() {
+ if (iteratorList != null && iteratorList.hasNext()) {
+ return iteratorList.next();
+ }
+ iteratorList = iteratorLength.next().list.iterator();
+ return iteratorList.next();
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ private class ListLength {
+ public int length;
+ public ArrayList<int[]> list = new ArrayList<int[]>();
+
+ @Override
+ public String toString() {
+ StringBuffer l = new StringBuffer("(");
+ for (int[] i : list) {
+ l.append(Arrays.toString(i));
+ l.append(",");
+ }
+ l.append(")");
+ return "(length:" + length + "," + l + ")";
+ }
+ }
+
+ private LinkedList<ListLength> list;
+
+ public InvertedListLengthList() {
+ list = new LinkedList<ListLength>();
+ }
+
+ public void add(int[] element) {
+ if (!list.isEmpty() && list.getLast().length == element[2]) {
+ list.getLast().list.add(element);
+ } else {
+ ListLength listLength = new ListLength();
+ listLength.length = element[2];
+ listLength.list.add(element);
+ list.add(listLength);
+ }
+ }
+
+ public Iterator<int[]> iterator() {
+ return new ListIterator();
+ }
+
+ public void setMinLength(int minLength) {
+ while (!list.isEmpty() && list.getFirst().length < minLength) {
+ list.removeFirst();
+ }
+ }
+
+ @Override
+ public String toString() {
+ return list.toString();
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListPlain.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListPlain.java
new file mode 100644
index 0000000..b8c37c5
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListPlain.java
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+public class InvertedListPlain {
+ private List<int[]> list;
+
+ public InvertedListPlain() {
+ list = new ArrayList<int[]>();
+ }
+
+ public boolean add(int[] element) {
+ list.add(element);
+ return true;
+ }
+
+ public Iterator<int[]> iterator() {
+ return list.iterator();
+ }
+
+ public void setMinLength(int minLength) {
+ }
+
+ @Override
+ public String toString() {
+ return list.toString();
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedLists.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedLists.java
new file mode 100644
index 0000000..7696a38
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedLists.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+public interface InvertedLists {
+ public InvertedList get(int token);
+
+ public void index(int token, int[] element);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthFixed.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthFixed.java
new file mode 100644
index 0000000..db21980
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthFixed.java
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.Map;
+
+public class InvertedListsLengthFixed implements InvertedLists {
+ public final InvertedListLengthFixed[] invertedLists;
+ public final Map<Integer, Integer> invertedListsSize;
+
+ public InvertedListsLengthFixed(int noTokens, Map<Integer, Integer> invertedListsSize) {
+ invertedLists = new InvertedListLengthFixed[noTokens];
+ this.invertedListsSize = invertedListsSize;
+ }
+
+ public InvertedList get(int token) {
+ return invertedLists[token];
+ }
+
+ public void index(int token, int[] element) {
+ if (invertedLists[token] == null) {
+ // invertedLists[token] = new InvertedListLengthList();
+ invertedLists[token] = new InvertedListLengthFixed(invertedListsSize.get(token));
+ }
+ invertedLists[token].add(element);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthList.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthList.java
new file mode 100644
index 0000000..21f24b7
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthList.java
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.HashMap;
+
+public class InvertedListsLengthList implements InvertedLists {
+ public final HashMap<Integer, InvertedListLengthList> invertedLists;
+
+ public InvertedListsLengthList() {
+ invertedLists = new HashMap<Integer, InvertedListLengthList>();
+ }
+
+ public InvertedListLengthList get(int token) {
+ return invertedLists.get(token);
+ }
+
+ public void index(int token, int[] element) {
+ InvertedListLengthList list = invertedLists.get(token);
+ if (list == null) {
+ list = new InvertedListLengthList();
+ invertedLists.put(token, list);
+ }
+ list.add(element);
+ }
+
+ public void prune(int minLength) {
+ for (InvertedListLengthList l : invertedLists.values()) {
+ l.setMinLength(minLength);
+ }
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroup.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroup.java
new file mode 100644
index 0000000..9341a60
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroup.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public abstract class RecordGroup {
+ protected final int noGroups;
+ protected final SimilarityFilters fuzzyFilters;
+
+ public RecordGroup(int noGroups, SimilarityFilters fuzzyFilters) {
+ this.noGroups = noGroups;
+ this.fuzzyFilters = fuzzyFilters;
+ }
+
+ public abstract Iterable<Integer> getGroups(Integer token, Integer length);
+
+ public abstract boolean isLengthOnly();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupFactory.java
new file mode 100644
index 0000000..9b60d4e
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupFactory.java
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupFactory {
+ public static RecordGroup getRecordGroup(String recordGroup, int noGroups, SimilarityFilters fuzzyFilters,
+ String lengthstatsPath) {
+ if (recordGroup.equals("LengthCount")) {
+ return new RecordGroupLengthCount(noGroups, fuzzyFilters, lengthstatsPath);
+ } else if (recordGroup.equals("LengthIdentity")) {
+ return new RecordGroupLengthIdentity(noGroups, fuzzyFilters);
+ } else if (recordGroup.equals("LengthRange")) {
+ return new RecordGroupLengthRange(noGroups, fuzzyFilters, lengthstatsPath);
+ } else if (recordGroup.equals("Single")) {
+ return new RecordGroupSingle(noGroups, fuzzyFilters);
+ } else if (recordGroup.equals("TokenIdentity")) {
+ return new RecordGroupTokenIdentity(noGroups, fuzzyFilters);
+ } else if (recordGroup.equals("TokenFrequency")) {
+ return new RecordGroupTokenFrequency(noGroups, fuzzyFilters);
+ } else if (recordGroup.equals("TokenFrequencyMirror")) {
+ return new RecordGroupTokenFrequencyMirror(noGroups, fuzzyFilters);
+ }
+ throw new RuntimeException("Unknown record group \"" + recordGroup + "\".");
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthCount.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthCount.java
new file mode 100644
index 0000000..785346a
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthCount.java
@@ -0,0 +1,111 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.io.DataInputStream;
+import java.io.EOFException;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupLengthCount extends RecordGroup {
+ private final int min;
+ private final int max;
+ private final int[] lengthGroups;
+
+ public RecordGroupLengthCount(int noGroups, SimilarityFilters fuzzyFilters, String lengthstatsPath) {
+ super(noGroups, fuzzyFilters);
+
+ int sum = 0;
+ int range = 0;
+
+ try {
+ DataInputStream in = new DataInputStream(new FileInputStream(lengthstatsPath.toString()));
+ min = in.readInt();
+ max = in.readInt();
+ range = max - min + 1;
+ lengthGroups = new int[range]; // stores freqencies initally
+ try {
+ while (true) {
+ int length = in.readInt();
+ int freq = in.readInt();
+
+ int lowAbs = fuzzyFilters.getLengthLowerBound(length);
+ int uppAbs = fuzzyFilters.getLengthUpperBound(length);
+
+ int low = Math.max(lowAbs - min, 0);
+ int upp = Math.min(uppAbs - min, max - min);
+
+ for (int l = low; l <= upp; ++l) {
+ lengthGroups[l] += freq;
+ sum += freq;
+ }
+ }
+ } catch (EOFException e) {
+ }
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+
+ int countGroup = sum / noGroups;
+ int count = 0;
+ int group = 0;
+ for (int i = 0; i < range; ++i) {
+ count += lengthGroups[i];
+ lengthGroups[i] = group;
+ if (count >= countGroup && group < noGroups - 1) {
+ count = 0;
+ group++;
+ }
+ }
+ }
+
+ @Override
+ public Iterable<Integer> getGroups(Integer token, Integer length) {
+ int lowAbs = fuzzyFilters.getLengthLowerBound(length);
+ int uppAbs = fuzzyFilters.getLengthUpperBound(length);
+
+ int low = Math.max(lowAbs - min, 0);
+ int upp = Math.min(uppAbs - min, max - min);
+
+ ArrayList<Integer> groups = new ArrayList<Integer>(upp - low + 1);
+ int prevGroup = -1;
+ for (int l = low; l <= upp; ++l) {
+ int group = lengthGroups[l];
+ if (group != prevGroup) {
+ groups.add(group);
+ prevGroup = group;
+ }
+ }
+
+ // System.out.println(noGroups + ":" + length + " [" + low + "," + upp
+ // + "] " + groups);
+
+ return groups;
+ }
+
+ @Override
+ public boolean isLengthOnly() {
+ return true;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthIdentity.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthIdentity.java
new file mode 100644
index 0000000..558eada
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthIdentity.java
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupLengthIdentity extends RecordGroup {
+
+ private final Collection<Integer> groups;
+
+ public RecordGroupLengthIdentity(int noGroups, SimilarityFilters fuzzyFilters) {
+ super(noGroups, fuzzyFilters);
+ groups = new LinkedList<Integer>();
+ }
+
+ @Override
+ public Iterable<Integer> getGroups(Integer token, Integer length) {
+ groups.clear();
+ groups.add(length);
+ return groups;
+ }
+
+ @Override
+ public boolean isLengthOnly() {
+ return true;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthRange.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthRange.java
new file mode 100644
index 0000000..2bdd02d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthRange.java
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupLengthRange extends RecordGroup {
+ private final int min;
+ private final int max;
+ private final int groupSize;
+
+ public RecordGroupLengthRange(int noGroups, SimilarityFilters fuzzyFilters, String lengthstatsPath) {
+ super(noGroups, fuzzyFilters);
+ try {
+ DataInputStream in = new DataInputStream(new FileInputStream(lengthstatsPath.toString()));
+ min = in.readInt();
+ max = in.readInt();
+ groupSize = (int) Math.ceil((max - min + 1f) / noGroups);
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ }
+
+ @Override
+ public Iterable<Integer> getGroups(Integer token, Integer length) {
+ int lowAbs = fuzzyFilters.getLengthLowerBound(length);
+ int uppAbs = fuzzyFilters.getLengthUpperBound(length);
+
+ int low = Math.max(lowAbs - min, 0);
+ int upp = Math.min(uppAbs - min, max - min);
+
+ ArrayList<Integer> groups = new ArrayList<Integer>(upp - low + 1);
+ int prevGroup = -1;
+ for (int l = low; l <= upp; ++l) {
+ int group = l / groupSize;
+ if (group != prevGroup) {
+ groups.add(group);
+ prevGroup = group;
+ }
+ }
+
+ // System.out.println(length + " [" + lowAbs + "," + uppAbs + "] "
+ // + groups);
+
+ return groups;
+ }
+
+ @Override
+ public boolean isLengthOnly() {
+ return true;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupSingle.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupSingle.java
new file mode 100644
index 0000000..0f9480f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupSingle.java
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupSingle extends RecordGroup {
+
+ private final Collection<Integer> groups = new LinkedList<Integer>();
+
+ public RecordGroupSingle(int noGroups, SimilarityFilters fuzzyFilters) {
+ super(noGroups, fuzzyFilters);
+ groups.add(0);
+ }
+
+ @Override
+ public Iterable<Integer> getGroups(Integer token, Integer length) {
+ return groups;
+ }
+
+ @Override
+ public boolean isLengthOnly() {
+ return true;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequency.java
new file mode 100644
index 0000000..f0cd032
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequency.java
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupTokenFrequency extends RecordGroup {
+
+ private final Collection<Integer> groups = new LinkedList<Integer>();
+
+ public RecordGroupTokenFrequency(int noGroups, SimilarityFilters fuzzyFilters) {
+ super(noGroups, fuzzyFilters);
+ }
+
+ @Override
+ public Iterable<Integer> getGroups(Integer token, Integer length) {
+ groups.clear();
+ groups.add(token % noGroups);
+ return groups;
+ }
+
+ @Override
+ public boolean isLengthOnly() {
+ return false;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequencyMirror.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequencyMirror.java
new file mode 100644
index 0000000..36085a2
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequencyMirror.java
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupTokenFrequencyMirror extends RecordGroup {
+
+ private final Collection<Integer> groups = new LinkedList<Integer>();
+
+ public RecordGroupTokenFrequencyMirror(int noGroups, SimilarityFilters fuzzyFilters) {
+ super(noGroups, fuzzyFilters);
+ }
+
+ @Override
+ public Iterable<Integer> getGroups(Integer token, Integer length) {
+ int noGroupsDouble = noGroups << 1;
+ int group = token % noGroupsDouble;
+ if (group >= noGroups) {
+ // mirror
+ group = -(group - noGroupsDouble) - 1;
+ }
+ groups.clear();
+ groups.add(group);
+ return groups;
+ }
+
+ @Override
+ public boolean isLengthOnly() {
+ return false;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenIdentity.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenIdentity.java
new file mode 100644
index 0000000..691fb10
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenIdentity.java
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupTokenIdentity extends RecordGroup {
+
+ private final Collection<Integer> groups;
+
+ public RecordGroupTokenIdentity(int noGroups, SimilarityFilters fuzzyFilters) {
+ super(noGroups, fuzzyFilters);
+ groups = new LinkedList<Integer>();
+ }
+
+ @Override
+ public Iterable<Integer> getGroups(Integer token, Integer length) {
+ groups.clear();
+ groups.add(token);
+ return groups;
+ }
+
+ @Override
+ public boolean isLengthOnly() {
+ return false;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
new file mode 100644
index 0000000..85d1785
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public interface IGenericSimilarityMetric {
+ // returns similarity
+ public float getSimilarity(IListIterator firstList, IListIterator secondList);
+
+ // returns -1 if does not satisfy threshold
+ // else returns similarity
+ public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IListIterator.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IListIterator.java
new file mode 100644
index 0000000..647c35f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IListIterator.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public interface IListIterator {
+ public int compare(IListIterator cmpIter);
+
+ public byte[] getData();
+
+ public int getPos();
+
+ public boolean hasNext();
+
+ public void next();
+
+ public void reset();
+
+ public int size();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/PartialIntersect.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/PartialIntersect.java
new file mode 100644
index 0000000..15c339e
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/PartialIntersect.java
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public class PartialIntersect {
+ public int intersectSize;
+ public int posXStart;
+ public int posXStop;
+ public int posYStart;
+ public int posYStop;
+
+ private boolean startSet = false;
+
+ public boolean isSet() {
+ return startSet;
+ }
+
+ public void reset() {
+ startSet = false;
+ }
+
+ public void set() {
+ startSet = true;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFilters.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFilters.java
new file mode 100644
index 0000000..48b22ca
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFilters.java
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import java.io.Serializable;
+
+public interface SimilarityFilters extends Serializable {
+ public int getLengthLowerBound(int length);
+
+ public int getLengthUpperBound(int length);
+
+ public int getPrefixLength(int length);
+
+ public boolean passLengthFilter(int lengthX, int lengthY);
+
+ public boolean passPositionFilter(int noGramsCommon, int positionX, int lengthX, int positionY, int lengthY);
+
+ public float passSimilarityFilter(final int[] tokensX, int startX, int lengthX, final int prefixLengthX,
+ final int[] tokensY, int startY, int lengthY, final int prefixLengthY, final int intersectionSizePrefix);
+
+ public float passSimilarityFilter(final int[] tokensX, final int prefixLengthX, final int[] tokensY,
+ final int prefixLengthY, final int intersectionSizePrefix);
+
+ public boolean passSuffixFilter(int[] tokensX, int startX, int lengthX, int positionX, int[] tokensY, int startY,
+ int lengthY, int positionY);
+
+ public boolean passSuffixFilter(int[] tokensX, int positionX, int[] tokensY, int positionY);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersFactory.java
new file mode 100644
index 0000000..07c68a3
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersFactory.java
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public class SimilarityFiltersFactory {
+ public static SimilarityFilters getSimilarityFilters(String similarityName, float similarityThreshold) {
+ if ("jaccard".equalsIgnoreCase(similarityName)) {
+ return new SimilarityFiltersJaccard(similarityThreshold);
+ }
+ throw new RuntimeException("Unknown fuzzy filters \"" + similarityName + "\".");
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersJaccard.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersJaccard.java
new file mode 100644
index 0000000..db9fefa
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersJaccard.java
@@ -0,0 +1,297 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import java.util.Arrays;
+
+public class SimilarityFiltersJaccard implements SimilarityFilters {
+ class Partition {
+ public int startL;
+ public int lengthL;
+ public int startR;
+ public int lengthR;
+ public int hamming;
+
+ public Partition() {
+ }
+
+ public Partition(int startL, int lengthL, int startR, int lengthR, int hamming) {
+ this.startL = startL;
+ this.lengthL = lengthL;
+ this.startR = startR;
+ this.lengthR = lengthR;
+ this.hamming = hamming;
+ }
+ }
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ private static final int MAX_DEPTH = 2;
+
+ public static int getLengthLowerBound(int length, float simThr) {
+ return (int) Math.ceil(simThr * length);
+ }
+
+ public static boolean passLengthFilter(int lengthX, int lengthY, float simThr) {
+ return getLengthLowerBound(lengthX, simThr) <= lengthY && lengthY <= 1 / simThr * lengthX;
+ }
+
+ protected float simThr;
+
+ protected float simThr100;
+
+ public SimilarityFiltersJaccard(float similarityThreshold) {
+ reset(similarityThreshold);
+ }
+
+ public int getIndexPrefixLength(int length) {
+ return length - (int) Math.ceil(2 * simThr100 / (100 + simThr100) * length) + 1;
+ }
+
+ public int getIntersectLowerBound(int lengthX, int lengthY) {
+ return (int) Math.ceil(simThr100 * (lengthX + lengthY) / (100 + simThr100));
+ }
+
+ public int getIntersectUpperBound(int noGramsCommon, int positionX, int positionY, int lengthX, int lengthY) {
+ return noGramsCommon + Math.min(lengthX - positionX - 1, lengthY - positionY - 1);
+ }
+
+ public int getLengthLowerBound(int length) {
+ return getLengthLowerBound(length, simThr);
+ }
+
+ public int getLengthUpperBound(int length) {
+ return (int) Math.floor(1 / simThr * length);
+ }
+
+ private Partition getPartition(int[] tokens, int start, int length, int w, int posL, int posR) {
+ int p;
+ if (tokens[posL] > w) {
+ p = posL;
+ } else if (tokens[posR] < w) {
+ p = posR;
+ } else {
+ p = Arrays.binarySearch(tokens, start, start + length, w);
+ }
+
+ if (p < 0) {
+ p = -p - 1;
+ }
+
+ if (p >= start && p < start + length && tokens[p] == w) {
+ return new Partition(start, p - start, p + 1, start + length - p - 1, 0);
+ }
+ return new Partition(start, p - start, p, start + length - p, 1);
+ }
+
+ public int getPrefixLength(int length) {
+ if (length == 0) {
+ return 0;
+ }
+ return length - (int) Math.ceil(simThr * length) + 1;
+ }
+
+ public float getSimilarityThreshold() {
+ return simThr;
+ }
+
+ private int getSuffixFilter(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY,
+ int hammingMax, int depth) {
+ final int lengthDiff = Math.abs(lengthX - lengthY);
+
+ if (depth > MAX_DEPTH || lengthX == 0 || lengthY == 0) {
+ return lengthDiff;
+ }
+
+ final int mid = startY + lengthY / 2 + lengthY % 2 - 1;
+ final int offset = (hammingMax - lengthDiff) / 2;
+
+ int offsetL;
+ int offsetR;
+ if (lengthX < lengthY) {
+ offsetL = 1;
+ offsetR = 0;
+ } else {
+ offsetL = 0;
+ offsetR = 1;
+ }
+ Partition partitionY = new Partition(startY, mid - startY, mid + 1, startY + lengthY - mid - 1, 0);
+
+ // Partition partitionX = getPartition(tokensX, startX, lengthX,
+ // tokensY[mid], Math.max(Math.min(mid + startX - startY, startX
+ // + lengthX - 1)
+ // - offset - Math.abs(lengthX - lengthY) * offsetL,
+ // startX), Math.min(Math.max(mid + startX - startY,
+ // startX)
+ // + offset + Math.abs(lengthX - lengthY) * offsetR,
+ // startX + lengthX - 1));
+
+ Partition partitionX = getPartition(tokensX, startX, lengthX, tokensY[mid],
+ Math.max(mid + startX - startY - offset - lengthDiff * offsetL, startX),
+ Math.min(mid + startX - startY + offset + lengthDiff * offsetR, startX + lengthX - 1));
+
+ int hammingPart = partitionX.hamming;
+
+ int hamming = Math.abs(partitionX.lengthL - partitionY.lengthL)
+ + Math.abs(partitionX.lengthR - partitionY.lengthR) + hammingPart;
+
+ if (hamming <= hammingMax) {
+ int hammingL = getSuffixFilter(tokensX, partitionX.startL, partitionX.lengthL, tokensY, partitionY.startL,
+ partitionY.lengthL, hammingMax - Math.abs(partitionX.lengthR - partitionY.lengthR) - hammingPart,
+ depth + 1);
+ hamming = hammingL + Math.abs(partitionX.lengthR - partitionY.lengthR) + hammingPart;
+
+ if (hamming <= hammingMax) {
+ int hammingR = getSuffixFilter(tokensX, partitionX.startR, partitionX.lengthR, tokensY,
+ partitionY.startR, partitionY.lengthR, hammingMax - hammingL - hammingPart, depth + 1);
+ hamming = hammingL + hammingR + hammingPart;
+ }
+ }
+ return hamming;
+ }
+
+ public boolean passLengthFilter(int lengthX, int lengthY) {
+ return passLengthFilter(lengthX, lengthY, simThr);
+ }
+
+ /**
+ * @param noGramsCommon
+ * number of grams in common
+ * @param positionX
+ * position of the last gram in common on X
+ * @param positionY
+ * position of the last gram in common on X
+ * @param lengthX
+ * @param lengthY
+ * @return
+ */
+ public boolean passPositionFilter(int noGramsCommon, int positionX, int lengthX, int positionY, int lengthY) {
+ return getIntersectUpperBound(noGramsCommon, positionX, positionY, lengthX, lengthY) >= getIntersectLowerBound(
+ lengthX, lengthY);
+ }
+
+ public float passSimilarityFilter(final int[] tokensX, int startX, int lengthX, final int prefixLengthX,
+ final int[] tokensY, int startY, int lengthY, final int prefixLengthY, final int intersectionSizePrefix) {
+ final int length = lengthX;
+ final int token = tokensX[startX + Math.min(prefixLengthX, lengthX) - 1];
+ final int lengthProbe = lengthY;
+ final int tokenProbe = tokensY[startY + prefixLengthY - 1];
+
+ final int intersectSizeLowerBound = getIntersectLowerBound(length, lengthProbe);
+ int intersectSize = 0;
+
+ if (token < tokenProbe) {
+ if (intersectionSizePrefix + length - prefixLengthX >= intersectSizeLowerBound) {
+ intersectSize = intersectionSizePrefix
+ + SimilarityMetric.getIntersectSize(tokensX, startX + prefixLengthX, lengthX - prefixLengthX,
+ tokensY, startY + intersectionSizePrefix, lengthY - intersectionSizePrefix);
+ }
+ } else {
+ if (intersectionSizePrefix + lengthProbe - prefixLengthY >= intersectSizeLowerBound) {
+ intersectSize = intersectionSizePrefix
+ + SimilarityMetric.getIntersectSize(tokensX, startX + intersectionSizePrefix, lengthX
+ - intersectionSizePrefix, tokensY, startY + prefixLengthY, lengthY - prefixLengthY);
+ }
+ }
+
+ if (intersectSize >= intersectSizeLowerBound) {
+ return ((float) intersectSize) / (length + lengthProbe - intersectSize);
+ }
+ return 0;
+ }
+
+ /**
+ * @param tokensX
+ * @param prefixLengthX
+ * @param tokensY
+ * @param prefixLengthY
+ * @param intersectionSizePrefix
+ * @return similarity if it is above or equal to the similarity threshold, 0
+ * otherwise
+ */
+ public float passSimilarityFilter(final int[] tokensX, final int prefixLengthX, final int[] tokensY,
+ final int prefixLengthY, final int intersectionSizePrefix) {
+ // final int length = tokensX.length;
+ // final int token = tokensX[Math.min(prefixLengthX, tokensX.length) -
+ // 1];
+ // final int lengthProbe = tokensY.length;
+ // final int tokenProbe = tokensY[prefixLengthY - 1];
+ //
+ // final int intersectSizeLowerBound = getIntersectLowerBound(length,
+ // lengthProbe);
+ // int intersectSize = 0;
+ //
+ // if (token < tokenProbe) {
+ // if (intersectionSizePrefix + length - prefixLengthX >=
+ // intersectSizeLowerBound) {
+ // intersectSize = intersectionSizePrefix
+ // + SimilarityMetric.getIntersectSize(tokensX,
+ // prefixLengthX, tokensY, intersectionSizePrefix);
+ // }
+ // } else {
+ // if (intersectionSizePrefix + lengthProbe - prefixLengthY >=
+ // intersectSizeLowerBound) {
+ // intersectSize = intersectionSizePrefix
+ // + SimilarityMetric.getIntersectSize(tokensX,
+ // intersectionSizePrefix, tokensY, prefixLengthY);
+ // }
+ // }
+ //
+ // if (intersectSize >= intersectSizeLowerBound) {
+ // return ((float) intersectSize)
+ // / (length + lengthProbe - intersectSize);
+ // }
+ // return 0;
+ return passSimilarityFilter(tokensX, 0, tokensX.length, prefixLengthX, tokensY, 0, tokensY.length,
+ prefixLengthY, intersectionSizePrefix);
+ }
+
+ public boolean passSuffixFilter(int[] tokensX, int tokensStartX, int tokensLengthX, int positionX, int[] tokensY,
+ int tokensStartY, int tokensLengthY, int positionY) {
+ int hammingMax = tokensLengthX + tokensLengthY - 2
+ * (int) Math.ceil(simThr100 / (100 + simThr100) * (tokensLengthX + tokensLengthY))
+ - (positionX + 1 + positionY + 1 - 2);
+ int hamming = getSuffixFilter(tokensX, tokensStartX + positionX + 1, tokensLengthX - positionX - 1, tokensY,
+ tokensStartY + positionY + 1, tokensLengthY - positionY - 1, hammingMax, 1);
+ return hamming <= hammingMax;
+ }
+
+ public boolean passSuffixFilter(int[] tokensX, int positionX, int[] tokensY, int positionY) {
+ // int hammingMax = tokensX.length
+ // + tokensY.length
+ // - 2
+ // * (int) Math.ceil(simThr100 / (100 + simThr100)
+ // * (tokensX.length + tokensY.length))
+ // - (positionX + 1 + positionY + 1 - 2);
+ // int hamming = getSuffixFilter(tokensX, positionX + 1, tokensX.length
+ // - positionX - 1, tokensY, positionY + 1, tokensY.length
+ // - positionY - 1, hammingMax, 1);
+ // return hamming <= hammingMax;
+ return passSuffixFilter(tokensX, 0, tokensX.length, positionX, tokensY, 0, tokensY.length, positionY);
+ }
+
+ public void reset(float similarityThreshold) {
+ simThr = similarityThreshold;
+ simThr100 = simThr * 100;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetric.java
new file mode 100644
index 0000000..415e785
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -0,0 +1,184 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Tokenizer;
+
+public abstract class SimilarityMetric {
+
+ public static int getIntersectSize(IListIterator tokensX, IListIterator tokensY) {
+ int intersectSize = 0;
+ while (tokensX.hasNext() && tokensY.hasNext()) {
+ int cmp = tokensX.compare(tokensY);
+ if (cmp > 0) {
+ tokensY.next();
+ } else if (cmp < 0) {
+ tokensX.next();
+ } else {
+ intersectSize++;
+ tokensX.next();
+ tokensY.next();
+ }
+ }
+ return intersectSize;
+ }
+
+ public static int getIntersectSize(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY) {
+ int posX = 0;
+ int posY = 0;
+ int intersectSize = 0;
+
+ while (posX < lengthX && posY < lengthY) {
+ int tokenX = tokensX[startX + posX];
+ int tokenY = tokensY[startY + posY];
+ if (tokenX > tokenY) {
+ posY++;
+ } else if (tokenX < tokenY) {
+ posX++;
+ } else {
+ intersectSize++;
+ posX++;
+ posY++;
+ }
+ }
+
+ return intersectSize;
+ }
+
+ public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY) {
+ // int intersectSize = 0;
+ //
+ // while (startX < tokensX.length && startY < tokensY.length) {
+ // int tokenX = tokensX[startX];
+ // int tokenY = tokensY[startY];
+ // if (tokenX > tokenY) {
+ // startY++;
+ // } else if (tokenX < tokenY) {
+ // startX++;
+ // } else {
+ // intersectSize++;
+ // startX++;
+ // startY++;
+ // }
+ // }
+ //
+ // return intersectSize;
+ return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
+ }
+
+ public static int getIntersectSize(int[] tokensX, int[] tokensY) {
+ return getIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length);
+ }
+
+ public static PartialIntersect getPartialIntersectSize(int[] tokensX, int startX, int lengthX, int[] tokensY,
+ int startY, int lengthY, int tokenStop) {
+ PartialIntersect parInter = new PartialIntersect();
+ getPartialIntersectSize(tokensX, startX, lengthX, tokensY, startY, lengthY, tokenStop, parInter);
+ return parInter;
+ }
+
+ public static void getPartialIntersectSize(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY,
+ int lengthY, int tokenStop, PartialIntersect parInter) {
+ int posX = 0;
+ int posY = 0;
+ int intersectSize = 0;
+
+ parInter.reset();
+ while (posX < lengthX && posY < lengthY) {
+ int tokenX = tokensX[startX + posX];
+ int tokenY = tokensY[startY + posY];
+ if (tokenX > tokenY) {
+ posY++;
+ } else if (tokenX < tokenY) {
+ posX++;
+ } else {
+ intersectSize++;
+ if (!parInter.isSet()) {
+ parInter.posXStart = posX;
+ parInter.posYStart = posY;
+ parInter.set();
+ }
+ if (tokenX == tokenStop) {
+ parInter.posXStop = posX;
+ parInter.posYStop = posY;
+ parInter.intersectSize = intersectSize;
+ }
+ posX++;
+ posY++;
+ }
+ }
+ }
+
+ public static PartialIntersect getPartialIntersectSize(int[] tokensX, int[] tokensY, int tokenStop) {
+ return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length, tokenStop);
+ }
+
+ // @SuppressWarnings("unchecked")
+ // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
+ // int intersectSize = 0;
+ //
+ // Iterator<Tuple> iteratorX = tokensX.iterator();
+ // Iterator<Tuple> iteratorY = tokensY.iterator();
+ //
+ // Tuple nextX = null;
+ // Tuple nextY = null;
+ //
+ // while ((nextX != null || iteratorX.hasNext())
+ // && (nextY != null || iteratorY.hasNext())) {
+ // if (nextX == null) {
+ // nextX = iteratorX.next();
+ // }
+ // if (nextY == null) {
+ // nextY = iteratorY.next();
+ // }
+ //
+ // int cmp = nextX.compareTo(nextY);
+ // if (cmp > 0) {
+ // nextY = null;
+ // } else if (cmp < 0) {
+ // nextX = null;
+ // } else {
+ // intersectSize++;
+ // nextX = null;
+ // nextY = null;
+ // }
+ // }
+ //
+ // return intersectSize;
+ // }
+
+ // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
+
+ // public abstract float getSimilarity(DataBag tokensX, int lengthX,
+ // DataBag tokensY, int lengthY);
+
+ public float getSimilarity(IListIterator tokensX, IListIterator tokensY) {
+ int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
+ int totalSize = tokensX.size() + tokensY.size();
+
+ return (float) intersectionSize / (totalSize - intersectionSize);
+ }
+
+ public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
+
+ public abstract float getSimilarity(int[] tokensX, int[] tokensY);
+
+ public abstract float getSimilarity(String stringX, String stringY, Tokenizer tokenizer);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
new file mode 100644
index 0000000..b99d6f7
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -0,0 +1,216 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import java.util.Arrays;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.StringUtils;
+
+public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
+
+ private final int utf8SizeIndicatorSize = 2;
+
+ // dp implementation only needs 2 rows
+ private final int rows = 2;
+ private int cols;
+ private int[][] matrix;
+
+ // for letter count filtering
+ private final int[] fsLcCount = new int[128];
+ private final int[] ssLcCount = new int[128];
+
+ public SimilarityMetricEditDistance() {
+ cols = 100; // arbitrary default value
+ matrix = new int[rows][cols];
+ }
+
+ @Override
+ public float getSimilarity(IListIterator firstList, IListIterator secondList) {
+ int flLen = firstList.size();
+ int slLen = secondList.size();
+
+ // reuse existing matrix if possible
+ if (slLen >= cols) {
+ cols = slLen + 1;
+ matrix = new int[rows][cols];
+ }
+
+ // init matrix
+ for (int i = 0; i <= slLen; i++) {
+ matrix[0][i] = i;
+ }
+
+ int currRow = 1;
+ int prevRow = 0;
+
+ // expand dynamic programming matrix row by row
+ for (int i = 1; i <= flLen; i++) {
+ matrix[currRow][0] = i;
+
+ secondList.reset();
+ for (int j = 1; j <= slLen; j++) {
+
+ matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+ matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0 : 1));
+
+ secondList.next();
+ }
+
+ firstList.next();
+
+ int tmp = currRow;
+ currRow = prevRow;
+ prevRow = tmp;
+ }
+
+ return matrix[prevRow][slLen];
+ }
+
+ @Override
+ public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh) {
+
+ int edThresh = (int) simThresh;
+
+ int flLen = firstList.size();
+ int slLen = secondList.size();
+
+ // length filter
+ if (Math.abs(flLen - slLen) > edThresh) {
+ return -1;
+ }
+
+ float ed = getSimilarity(firstList, secondList);
+ if (ed > edThresh) {
+ return -1;
+ } else {
+ return ed;
+ }
+ }
+
+ // faster implementation for common case of string edit distance
+ public int UTF8StringEditDistance(byte[] bytes, int fsStart, int ssStart) {
+
+ int fsLen = StringUtils.getStrLen(bytes, fsStart);
+ int ssLen = StringUtils.getStrLen(bytes, ssStart);
+
+ // reuse existing matrix if possible
+ if (ssLen >= cols) {
+ cols = ssLen + 1;
+ matrix = new int[rows][cols];
+ }
+
+ int fsDataStart = fsStart + utf8SizeIndicatorSize;
+ int ssDataStart = ssStart + utf8SizeIndicatorSize;
+
+ // init matrix
+ for (int i = 0; i <= ssLen; i++) {
+ matrix[0][i] = i;
+ }
+
+ int currRow = 1;
+ int prevRow = 0;
+
+ // expand dynamic programming matrix row by row
+ int fsPos = fsDataStart;
+ for (int i = 1; i <= fsLen; i++) {
+ matrix[currRow][0] = i;
+ char fsChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, fsPos));
+
+ int ssPos = ssDataStart;
+ for (int j = 1; j <= ssLen; j++) {
+ char ssChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, ssPos));
+
+ matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+ matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
+
+ ssPos += StringUtils.charSize(bytes, ssPos);
+ }
+
+ fsPos += StringUtils.charSize(bytes, fsPos);
+
+ int tmp = currRow;
+ currRow = prevRow;
+ prevRow = tmp;
+ }
+
+ return matrix[prevRow][ssLen];
+ }
+
+ public int UTF8StringEditDistance(byte[] bytes, int fsStart, int ssStart, int edThresh) {
+
+ int fsUtfLen = StringUtils.getUTFLen(bytes, fsStart);
+ int ssUtfLen = StringUtils.getUTFLen(bytes, ssStart);
+
+ // length filter
+ if (Math.abs(fsUtfLen - ssUtfLen) > edThresh) {
+ return -1;
+ }
+
+ // initialize letter count filtering
+ Arrays.fill(fsLcCount, 0);
+ Arrays.fill(ssLcCount, 0);
+
+ // compute letter counts for first string
+ int fsPos = fsStart + utf8SizeIndicatorSize;
+ int fsEnd = fsPos + fsUtfLen;
+ while (fsPos < fsEnd) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(bytes, fsPos));
+ if (c < 128) {
+ fsLcCount[c]++;
+ }
+ fsPos += StringUtils.charSize(bytes, fsPos);
+ }
+
+ // compute letter counts for second string
+ int ssPos = ssStart + utf8SizeIndicatorSize;
+ int ssEnd = ssPos + ssUtfLen;
+ while (ssPos < ssEnd) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(bytes, ssPos));
+ if (c < 128) {
+ ssLcCount[c]++;
+ }
+ ssPos += StringUtils.charSize(bytes, ssPos);
+ }
+
+ // apply filter
+ int gtSum = 0;
+ int ltSum = 0;
+ for (int i = 0; i < 128; i++) {
+ if (fsLcCount[i] > ssLcCount[i]) {
+ gtSum += fsLcCount[i] - ssLcCount[i];
+ if (gtSum > edThresh) {
+ return -1;
+ }
+ } else {
+ ltSum += ssLcCount[i] - fsLcCount[i];
+ if (ltSum > edThresh) {
+ return -1;
+ }
+ }
+ }
+
+ int ed = UTF8StringEditDistance(bytes, fsStart, ssStart);
+ if (ed > edThresh) {
+ return -1;
+ } else {
+ return ed;
+ }
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricFactory.java
new file mode 100644
index 0000000..a66328b
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricFactory.java
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public class SimilarityMetricFactory {
+ public static SimilarityMetric getSimilarityMetric(String similarityMetric) {
+ if (similarityMetric.equals("Jaccard")) {
+ return new SimilarityMetricJaccard();
+ }
+ throw new RuntimeException("Unknown fuzzy metric \"" + similarityMetric + "\".");
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
new file mode 100644
index 0000000..b0c0638
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -0,0 +1,118 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import java.util.Set;
+import java.util.TreeSet;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Tokenizer;
+
+public class SimilarityMetricJaccard extends SimilarityMetric implements IGenericSimilarityMetric {
+
+ public static float getSimilarity(int intersectSize, int lengthX, int lengthY) {
+ return ((float) intersectSize) / (lengthX + lengthY - intersectSize);
+ }
+
+ public static float getSimilarityBag(int[] tokensX, int[] tokensY) {
+ Set<Integer> setX = new TreeSet<Integer>();
+ for (int token : tokensX) {
+ setX.add(token);
+ }
+ Set<Integer> setY = new TreeSet<Integer>();
+ for (int token : tokensY) {
+ setY.add(token);
+ }
+ setX.retainAll(setY);
+ return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
+ }
+
+ // @Override
+ // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
+ // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
+ // (int) tokensY.size());
+ // }
+
+ // @Override
+ // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
+ // int lengthY) {
+ // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
+ // tokensY);
+ // int totalSize = lengthX + lengthY;
+ //
+ // return (float) intersectionSize / (totalSize - intersectionSize);
+ // }
+
+ @Override
+ public float getSimilarity(IListIterator tokensX, IListIterator tokensY) {
+ int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
+ int totalSize = tokensX.size() + tokensY.size();
+
+ return (float) intersectionSize / (totalSize - intersectionSize);
+ }
+
+ @Override
+ public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh) {
+
+ // apply length filter
+ int lengthLowerBound = (int) Math.ceil(simThresh * firstList.size());
+
+ boolean passesLengthFilter = (lengthLowerBound <= secondList.size())
+ && (secondList.size() <= 1.0f / simThresh * firstList.size());
+ if (!passesLengthFilter) {
+ return -1f;
+ }
+
+ float jacc = getSimilarity(firstList, secondList);
+ if (jacc < simThresh) {
+ return -1f;
+ } else {
+ return jacc;
+ }
+ }
+
+ @Override
+ public float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY) {
+ int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, startX, lengthX, tokensY, startY, lengthY);
+ int totalSize = lengthX + lengthY;
+
+ return (float) intersectionSize / (totalSize - intersectionSize);
+ }
+
+ @Override
+ public float getSimilarity(int[] tokensX, int[] tokensY) {
+ return getSimilarity(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length);
+ }
+
+ @Override
+ public float getSimilarity(String stringX, String stringY, Tokenizer tokenizer) {
+ Set<String> setX = new TreeSet<String>();
+ for (String token : tokenizer.tokenize(stringX)) {
+ setX.add(token);
+ }
+ Set<String> setY = new TreeSet<String>();
+ for (String token : tokenizer.tokenize(stringY)) {
+ setY.add(token);
+ }
+ int lengthX = setX.size();
+ int lengthY = setY.size();
+ setX.retainAll(setY);
+ return ((float) setX.size()) / (lengthX + lengthY - setX.size());
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..0e3c28f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8StringBinaryTokenizer.java
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import edu.uci.ics.asterix.fuzzyjoin.IntArray;
+
+public abstract class AbstractUTF8StringBinaryTokenizer implements IBinaryTokenizer {
+
+ protected byte[] data;
+ protected int start;
+ protected int length;
+ protected int tokenLength;
+ protected int index;
+ protected int utf8Length;
+
+ protected final IntArray tokensStart;
+ protected final IntArray tokensLength;
+ protected final IToken token;
+
+ protected final boolean ignoreTokenCount;
+ protected final boolean sourceHasTypeTag;
+
+ public AbstractUTF8StringBinaryTokenizer(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+ ITokenFactory tokenFactory) {
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ if (!ignoreTokenCount) {
+ tokensStart = new IntArray();
+ tokensLength = new IntArray();
+ } else {
+ tokensStart = null;
+ tokensLength = null;
+ }
+ token = tokenFactory.createToken();
+ }
+
+ @Override
+ public IToken getToken() {
+ return token;
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ this.start = start;
+ index = this.start;
+ if (sourceHasTypeTag) {
+ index++; // skip type tag
+ }
+ utf8Length = StringUtils.getUTFLen(data, index);
+ index += 2; // skip utf8 length indicator
+ this.data = data;
+ this.length = length + start;
+
+ tokenLength = 0;
+ if (!ignoreTokenCount) {
+ tokensStart.reset();
+ tokensLength.reset();
+ }
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8Token.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8Token.java
new file mode 100644
index 0000000..d478f36
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8Token.java
@@ -0,0 +1,103 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public abstract class AbstractUTF8Token implements IToken {
+ public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+
+ protected int length;
+ protected int tokenLength;
+ protected int start;
+ protected int tokenCount;
+ protected byte[] data;
+ protected final byte tokenTypeTag;
+ protected final byte countTypeTag;
+
+ public AbstractUTF8Token() {
+ tokenTypeTag = -1;
+ countTypeTag = -1;
+ }
+
+ public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+ this.tokenTypeTag = tokenTypeTag;
+ this.countTypeTag = countTypeTag;
+ }
+
+ @Override
+ public byte[] getData() {
+ return data;
+ }
+
+ @Override
+ public int getLength() {
+ return length;
+ }
+
+ public int getLowerCaseUTF8Len(int size) {
+ int lowerCaseUTF8Len = 0;
+ int pos = start;
+ for (int i = 0; i < size; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+ pos += StringUtils.charSize(data, pos);
+ }
+ return lowerCaseUTF8Len;
+ }
+
+ @Override
+ public int getStart() {
+ return start;
+ }
+
+ @Override
+ public int getTokenLength() {
+ return tokenLength;
+ }
+
+ public void handleCountTypeTag(DataOutput dos) throws IOException {
+ if (countTypeTag > 0) {
+ dos.write(countTypeTag);
+ }
+ }
+
+ public void handleTokenTypeTag(DataOutput dos) throws IOException {
+ if (tokenTypeTag > 0) {
+ dos.write(tokenTypeTag);
+ }
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+ this.data = data;
+ this.start = start;
+ this.length = length;
+ this.tokenLength = tokenLength;
+ this.tokenCount = tokenCount;
+ }
+
+ @Override
+ public void serializeTokenCount(DataOutput dos) throws IOException {
+ handleCountTypeTag(dos);
+ dos.writeInt(tokenCount);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8TokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8TokenFactory.java
new file mode 100644
index 0000000..5ade0dc
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8TokenFactory.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public abstract class AbstractUTF8TokenFactory implements ITokenFactory {
+ private static final long serialVersionUID = 1L;
+ protected final byte tokenTypeTag;
+ protected final byte countTypeTag;
+
+ public AbstractUTF8TokenFactory() {
+ tokenTypeTag = -1;
+ countTypeTag = -1;
+ }
+
+ public AbstractUTF8TokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ this.tokenTypeTag = tokenTypeTag;
+ this.countTypeTag = countTypeTag;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..5a12c00
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizer.java
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class DelimitedUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
+
+ public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+ ITokenFactory tokenFactory) {
+ super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ }
+
+ @Override
+ public boolean hasNext() {
+ // skip delimiters
+ while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+ index += StringUtils.charSize(data, index);
+ }
+ return index < length;
+ }
+
+ private boolean isSeparator(char c) {
+ return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+ }
+
+ @Override
+ public void next() {
+ tokenLength = 0;
+ int currentTokenStart = index;
+ while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+ index += StringUtils.charSize(data, index);
+ tokenLength++;
+ }
+ int tokenCount = 1;
+ if (tokenLength > 0 && !ignoreTokenCount) {
+ // search if we got the same token before
+ for (int i = 0; i < tokensStart.length(); ++i) {
+ if (tokenLength == tokensLength.get(i)) {
+ int tokenStart = tokensStart.get(i);
+ tokenCount++; // assume we found it
+ int offset = 0;
+ int currLength = 0;
+ while (currLength < tokenLength) {
+ // case insensitive comparison
+ if (StringUtils.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != StringUtils
+ .toLowerCase(StringUtils.charAt(data, tokenStart + offset))) {
+ tokenCount--;
+ break;
+ }
+ offset += StringUtils.charSize(data, currentTokenStart + offset);
+ currLength++;
+ }
+ }
+ }
+ // add the new token to the list of seen tokens
+ tokensStart.add(currentTokenStart);
+ tokensLength.add(tokenLength);
+ }
+
+ // set token
+ token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..cb002df
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
+
+ private static final long serialVersionUID = 1L;
+ private final boolean ignoreTokenCount;
+ private final boolean sourceHasTypeTag;
+ private final ITokenFactory tokenFactory;
+
+ public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+ ITokenFactory tokenFactory) {
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ this.tokenFactory = tokenFactory;
+ }
+
+ @Override
+ public IBinaryTokenizer createTokenizer() {
+ return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramToken.java
new file mode 100644
index 0000000..461638f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramToken.java
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class HashedUTF8NGramToken extends UTF8NGramToken {
+ public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ int hash = GOLDEN_RATIO_32;
+
+ // pre chars
+ for (int i = 0; i < numPreChars; i++) {
+ hash ^= PRECHAR;
+ hash *= GOLDEN_RATIO_32;
+ }
+
+ // regular chars
+ int numRegGrams = tokenLength - numPreChars - numPostChars;
+ int pos = start;
+ for (int i = 0; i < numRegGrams; i++) {
+ hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ hash *= GOLDEN_RATIO_32;
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ // post chars
+ for (int i = 0; i < numPostChars; i++) {
+ hash ^= POSTCHAR;
+ hash *= GOLDEN_RATIO_32;
+ }
+
+ // token count
+ hash += tokenCount;
+
+ dos.writeInt(hash);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
new file mode 100644
index 0000000..9b6f835
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public HashedUTF8NGramTokenFactory() {
+ super();
+ }
+
+ public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
new file mode 100644
index 0000000..c7d5f3a
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
@@ -0,0 +1,84 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class HashedUTF8WordToken extends UTF8WordToken {
+
+ private int hash = 0;
+
+ public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+ if (!(o instanceof IToken)) {
+ return false;
+ }
+ IToken t = (IToken) o;
+ if (t.getTokenLength() != tokenLength) {
+ return false;
+ }
+ int offset = 0;
+ for (int i = 0; i < tokenLength; i++) {
+ if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils.charAt(data, start + offset)) {
+ return false;
+ }
+ offset += StringUtils.charSize(data, start + offset);
+ }
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ return hash;
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+ super.reset(data, start, length, tokenLength, tokenCount);
+
+ // pre-compute hash value using JAQL-like string hashing
+ int pos = start;
+ hash = GOLDEN_RATIO_32;
+ for (int i = 0; i < tokenLength; i++) {
+ hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ hash *= GOLDEN_RATIO_32;
+ pos += StringUtils.charSize(data, pos);
+ }
+ hash += tokenCount;
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ if (tokenTypeTag > 0) {
+ dos.write(tokenTypeTag);
+ }
+
+ // serialize hash value
+ dos.writeInt(hash);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
new file mode 100644
index 0000000..aec26bc
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public HashedUTF8WordTokenFactory() {
+ super();
+ }
+
+ public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
new file mode 100644
index 0000000..2403e2f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public interface IBinaryTokenizer {
+ public IToken getToken();
+
+ public boolean hasNext();
+
+ public void next();
+
+ public void reset(byte[] data, int start, int length);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..eecd98c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.Serializable;
+
+public interface IBinaryTokenizerFactory extends Serializable {
+ public IBinaryTokenizer createTokenizer();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/INGramToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/INGramToken.java
new file mode 100644
index 0000000..ae55837
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/INGramToken.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public interface INGramToken {
+ public int getNumPostChars();
+
+ public int getNumPreChars();
+
+ public void setNumPrePostChars(int numPreChars, int numPostChars);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IToken.java
new file mode 100644
index 0000000..aa6dc86
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IToken.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public interface IToken {
+ public byte[] getData();
+
+ public int getLength();
+
+ public int getStart();
+
+ public int getTokenLength();
+
+ public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount);
+
+ public void serializeToken(DataOutput dos) throws IOException;
+
+ public void serializeTokenCount(DataOutput dos) throws IOException;
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/ITokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
new file mode 100644
index 0000000..e5a95a1
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.Serializable;
+
+public interface ITokenFactory extends Serializable {
+ public IToken createToken();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
new file mode 100644
index 0000000..e909a24
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+public class NGramTokenizer implements Tokenizer {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ public static void main(String args[]) {
+ Tokenizer tokenizer = new NGramTokenizer();
+ String a = "hadoopoop";
+ System.out.println(a + ":" + tokenizer.tokenize(a));
+ }
+
+ private final int gramLength;
+
+ /**
+ * padding used in q gram calculation.
+ */
+ private final char QGRAMENDPADDING = '$';
+
+ /**
+ * padding used in q gram calculation.
+ */
+ private final char QGRAMSTARTPADDING = '$';
+
+ public NGramTokenizer() {
+ gramLength = 3;
+ }
+
+ public NGramTokenizer(int gramLength) {
+ this.gramLength = gramLength;
+ }
+
+ private StringBuffer getAdjustedString(String input) {
+ final StringBuffer adjustedString = new StringBuffer();
+ for (int i = 0; i < gramLength - 1; i++) {
+ adjustedString.append(QGRAMSTARTPADDING);
+ }
+ adjustedString.append(input);
+ for (int i = 0; i < gramLength - 1; i++) {
+ adjustedString.append(QGRAMENDPADDING);
+ }
+ return adjustedString;
+ }
+
+ public List<String> tokenize(String input) {
+ final ArrayList<String> returnVect = new ArrayList<String>();
+ final StringBuffer adjustedString = getAdjustedString(input);
+ int curPos = 0;
+ final int length = adjustedString.length() - (gramLength - 1);
+ final HashMap<String, Integer> grams = new HashMap<String, Integer>();
+ while (curPos < length) {
+ final String term = adjustedString.substring(curPos, curPos + gramLength);
+ Integer count = grams.get(term);
+ if (count == null) {
+ count = new Integer(0);
+ }
+ count++;
+ grams.put(term, count);
+ returnVect.add(term + count);
+ curPos++;
+ }
+ return returnVect;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..362126f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
@@ -0,0 +1,116 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class NGramUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
+
+ private int gramLength;
+ private boolean usePrePost;
+
+ private int gramNum;
+ private int totalGrams;
+
+ private final INGramToken concreteToken;
+
+ public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ this.gramLength = gramLength;
+ this.usePrePost = usePrePost;
+ concreteToken = (INGramToken) token;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (gramNum < totalGrams) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void next() {
+ int currentTokenStart = index;
+ int tokenCount = 1;
+ int numPreChars = 0;
+ int numPostChars = 0;
+ if (usePrePost) {
+ numPreChars = Math.max(gramLength - gramNum - 1, 0);
+ numPostChars = (gramNum > totalGrams - gramLength) ? gramLength - totalGrams + gramNum : 0;
+ }
+ gramNum++;
+
+ concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+ if (numPreChars == 0) {
+ index += StringUtils.charSize(data, index);
+ }
+
+ // compute token count
+ // ignore pre and post grams for duplicate detection
+ if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+ int tmpIndex = start;
+ while (tmpIndex < currentTokenStart) {
+ tokenCount++; // assume found
+ int offset = 0;
+ for (int j = 0; j < gramLength; j++) {
+ if (StringUtils.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != StringUtils
+ .toLowerCase(StringUtils.charAt(data, tmpIndex + offset))) {
+ tokenCount--;
+ break;
+ }
+ offset += StringUtils.charSize(data, tmpIndex + offset);
+ }
+ tmpIndex += StringUtils.charSize(data, tmpIndex);
+ }
+ }
+
+ // set token
+ token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ super.reset(data, start, length);
+ gramNum = 0;
+
+ int numChars = 0;
+ int pos = index;
+ int end = pos + utf8Length;
+ while (pos < end) {
+ numChars++;
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ if (usePrePost) {
+ totalGrams = numChars + gramLength - 1;
+ } else {
+ totalGrams = numChars - gramLength + 1;
+ }
+ }
+
+ public void setGramlength(int gramLength) {
+ this.gramLength = gramLength;
+ }
+
+ public void setPrePost(boolean usePrePost) {
+ this.usePrePost = usePrePost;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/StringUtils.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/StringUtils.java
new file mode 100644
index 0000000..48d61e7
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/StringUtils.java
@@ -0,0 +1,216 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class StringUtils {
+ public static char charAt(byte[] b, int s) {
+ int c = b[s] & 0xff;
+ switch (c >> 4) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ return (char) c;
+
+ case 12:
+ case 13:
+ return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
+
+ case 14:
+ return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
+
+ default:
+ throw new IllegalArgumentException();
+ }
+ }
+
+ public static int charSize(byte[] b, int s) {
+ int c = b[s] & 0xff;
+ switch (c >> 4) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ return 1;
+
+ case 12:
+ case 13:
+ return 2;
+
+ case 14:
+ return 3;
+ }
+ throw new IllegalStateException();
+ }
+
+ public static int getModifiedUTF8Len(char c) {
+ if (c >= 0x0000 && c <= 0x007F) {
+ return 1;
+ } else if (c <= 0x07FF) {
+ return 2;
+ } else {
+ return 3;
+ }
+ }
+
+ public static int getStrLen(byte[] b, int s) {
+ int pos = s + 2;
+ int end = pos + getUTFLen(b, s);
+ int charCount = 0;
+ while (pos < end) {
+ charCount++;
+ pos += charSize(b, pos);
+ }
+ return charCount;
+ }
+
+ public static int getUTFLen(byte[] b, int s) {
+ return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
+ }
+
+ public static char toLowerCase(char c) {
+ switch (c) {
+ case 'A':
+ return 'a';
+ case 'B':
+ return 'b';
+ case 'C':
+ return 'c';
+ case 'D':
+ return 'd';
+ case 'E':
+ return 'e';
+ case 'F':
+ return 'f';
+ case 'G':
+ return 'g';
+ case 'H':
+ return 'h';
+ case 'I':
+ return 'i';
+ case 'J':
+ return 'j';
+ case 'K':
+ return 'k';
+ case 'L':
+ return 'l';
+ case 'M':
+ return 'm';
+ case 'N':
+ return 'n';
+ case 'O':
+ return 'o';
+ case 'P':
+ return 'p';
+ case 'Q':
+ return 'q';
+ case 'R':
+ return 'r';
+ case 'S':
+ return 's';
+ case 'T':
+ return 't';
+ case 'U':
+ return 'u';
+ case 'V':
+ return 'v';
+ case 'W':
+ return 'w';
+ case 'X':
+ return 'x';
+ case 'Y':
+ return 'y';
+ case 'Z':
+ return 'z';
+ case 'Ä':
+ return 'ä';
+ case 'Ǟ':
+ return 'ǟ';
+ case 'Ë':
+ return 'ë';
+ case 'Ḧ':
+ return 'ḧ';
+ case 'Ï':
+ return 'ï';
+ case 'Ḯ':
+ return 'ḯ';
+ case 'Ö':
+ return 'ö';
+ case 'Ȫ':
+ return 'ȫ';
+ case 'Ṏ':
+ return 'ṏ';
+ case 'Ü':
+ return 'ü';
+ case 'Ǖ':
+ return 'ǖ';
+ case 'Ǘ':
+ return 'ǘ';
+ case 'Ǚ':
+ return 'ǚ';
+ case 'Ǜ':
+ return 'ǜ';
+ case 'Ṳ':
+ return 'ṳ';
+ case 'Ṻ':
+ return 'ṻ';
+ case 'Ẅ':
+ return 'ẅ';
+ case 'Ẍ':
+ return 'ẍ';
+ case 'Ÿ':
+ return 'ÿ';
+ default:
+ // since I probably missed some chars above
+ // use Java to convert to lower case to be safe
+ return Character.toLowerCase(c);
+ }
+ }
+
+ public static void writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
+
+ if (c >= 0x0000 && c <= 0x007F) {
+ dos.writeByte(c);
+ } else if (c <= 0x07FF) {
+ dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
+ dos.writeByte((byte) (0x80 | (c & 0x3F)));
+ } else {
+ dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+ dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+ dos.writeByte((byte) (0x80 | (c & 0x3F)));
+ }
+ }
+
+ public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
+ dos.write((len >>> 8) & 0xFF);
+ dos.write((len >>> 0) & 0xFF);
+ }
+}
\ No newline at end of file
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Token.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Token.java
new file mode 100644
index 0000000..6c6d365
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Token.java
@@ -0,0 +1,118 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.Serializable;
+
+public class Token implements Serializable {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ private CharSequence data;
+ private int start;
+ private int length;
+ private int count;
+
+ /** Cache the hash code for the string */
+ private int hash; // Default to 0
+
+ public Token() {
+ }
+
+ public Token(CharSequence data, int start, int length, int count) {
+ set(data, start, length, count);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+ if (!(o instanceof Token)) {
+ return false;
+ }
+ Token t = (Token) o;
+ if (t.length != length) {
+ return false;
+ }
+ for (int i = 0; i < length; i++) {
+ if (t.data.charAt(t.start + i) != data.charAt(start + i)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ public CharSequence getCharSequence() {
+ return data;
+ }
+
+ public int getCount() {
+ return count;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ public int getStart() {
+ return start;
+ }
+
+ @Override
+ public int hashCode() {
+ int h = hash;
+ if (h == 0 && length > 0) {
+ for (int i = 0; i < length; i++) {
+ h = 31 * h + data.charAt(start + i);
+ }
+ h = 31 * h + count;
+ hash = h;
+ }
+ return h;
+ }
+
+ public int length() {
+ return length;
+ }
+
+ public void set(CharSequence data, int start, int length, int count) {
+ this.data = data;
+ this.start = start;
+ this.length = length;
+ this.count = count;
+ hash = 0;
+ }
+
+ public void set(String data, int count) {
+ this.data = data;
+ start = 0;
+ length = data.length();
+ this.count = count;
+ hash = 0;
+ }
+
+ @Override
+ public String toString() {
+ return "(" + data.subSequence(start, start + length) + ", " + count + ")";
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Tokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Tokenizer.java
new file mode 100644
index 0000000..312c626
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Tokenizer.java
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.Serializable;
+import java.util.List;
+
+public interface Tokenizer extends Serializable {
+ public List<String> tokenize(String text);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
new file mode 100644
index 0000000..76dc588
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public interface TokenizerBuffered {
+ public void advance();
+
+ public boolean end();
+
+ public Token getToken();
+
+ public void reset();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
new file mode 100644
index 0000000..8be793c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class TokenizerBufferedFactory {
+ public static TokenizerBuffered getTokenizer(String tokenizer, StringBuilder buffer) {
+ if (tokenizer.equals("Word")) {
+ return new WordTokenizerBuffered(buffer);
+ }
+ throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\".");
+ }
+
+ public static boolean isSeparator(char c) {
+ return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
new file mode 100644
index 0000000..dad43e9
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class TokenizerFactory {
+ public static Tokenizer getTokenizer(String tokenizer, String wordSeparator, char tokenSeparator) {
+ if (tokenizer.equals("NGram")) {
+ return new NGramTokenizer();
+ } else if (tokenizer.equals("Word")) {
+ return new WordTokenizer(wordSeparator, tokenSeparator);
+ }
+ throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\".");
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
new file mode 100644
index 0000000..07a5e0d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
+
+ public final static char PRECHAR = '#';
+
+ public final static char POSTCHAR = '$';
+
+ protected int numPreChars;
+ protected int numPostChars;
+
+ public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public int getNumPostChars() {
+ return numPreChars;
+ }
+
+ @Override
+ public int getNumPreChars() {
+ return numPostChars;
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ // regular chars
+ int numRegChars = tokenLength - numPreChars - numPostChars;
+
+ // assuming pre and post char need 1-byte each in utf8
+ int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars + numPostChars;
+
+ // write utf8 length indicator
+ StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+
+ // pre chars
+ for (int i = 0; i < numPreChars; i++) {
+ StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+ }
+
+ int pos = start;
+ for (int i = 0; i < numRegChars; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ StringUtils.writeCharAsModifiedUTF8(c, dos);
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ // post chars
+ for (int i = 0; i < numPostChars; i++) {
+ StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+ }
+ }
+
+ public void setNumPrePostChars(int numPreChars, int numPostChars) {
+ this.numPreChars = numPreChars;
+ this.numPostChars = numPostChars;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
new file mode 100644
index 0000000..4417b9c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public UTF8NGramTokenFactory() {
+ super();
+ }
+
+ public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new UTF8NGramToken(tokenTypeTag, countTypeTag);
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
new file mode 100644
index 0000000..aacbfd8
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class UTF8WordToken extends AbstractUTF8Token {
+
+ public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
+ StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+ int pos = start;
+ for (int i = 0; i < tokenLength; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ StringUtils.writeCharAsModifiedUTF8(c, dos);
+ pos += StringUtils.charSize(data, pos);
+ }
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
new file mode 100644
index 0000000..666f6bb
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public UTF8WordTokenFactory() {
+ super();
+ }
+
+ public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new UTF8WordToken(tokenTypeTag, countTypeTag);
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
new file mode 100644
index 0000000..963ad33
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+public class WordTokenizer implements Tokenizer {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ public static void main(String args[]) {
+ Tokenizer tokenizer = new WordTokenizer("_", '_');
+ String a = "hadoop_rocks_in_java";
+ System.out.println(a + ":" + tokenizer.tokenize(a));
+ }
+
+ private final String wordSeparator;
+ private final char tokenSeparator;
+
+ public WordTokenizer() {
+ this(" ", '_');
+ }
+
+ public WordTokenizer(String wordSeparator, char tokenSeparator) {
+ this.wordSeparator = wordSeparator;
+ this.tokenSeparator = tokenSeparator;
+ }
+
+ public List<String> tokenize(String input) {
+ final ArrayList<String> returnVect = new ArrayList<String>();
+ final HashMap<String, Integer> tokens = new HashMap<String, Integer>();
+ for (String term : input.split(wordSeparator)) {
+ if (term.length() == 0) {
+ continue;
+ }
+ Integer count = tokens.get(term);
+ if (count == null) {
+ count = 0;
+ }
+ count++;
+ tokens.put(term, count);
+ returnVect.add(term + tokenSeparator + count);
+ }
+ return returnVect;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
new file mode 100644
index 0000000..5a64d24
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
@@ -0,0 +1,92 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import edu.uci.ics.asterix.fuzzyjoin.IntArray;
+
+public class WordTokenizerBuffered implements TokenizerBuffered {
+
+ private final StringBuilder buffer;
+ private int index;
+ private final Token token;
+
+ private final IntArray tokensStart, tokensLength;
+
+ public WordTokenizerBuffered(StringBuilder buffer) {
+ this.buffer = buffer;
+ token = new Token();
+ tokensStart = new IntArray();
+ tokensLength = new IntArray();
+ reset();
+ }
+
+ @Override
+ public void advance() {
+ while (index < buffer.length() && TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) {
+ index++;
+ }
+ int start = index;
+ while (index < buffer.length() && !TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) {
+ buffer.setCharAt(index, Character.toLowerCase(buffer.charAt(index)));
+ index++;
+ }
+ int length = index - start;
+ int count = 1;
+ if (length > 0) {
+ // search if we got the same token before
+ for (int i = 0; i < tokensStart.length(); ++i) {
+ if (length == tokensLength.get(i)) {
+ int tokenStart = tokensStart.get(i);
+ count++; // assume we found it
+ for (int j = 0; j < length; ++j) {
+ if (buffer.charAt(start + j) != buffer.charAt(tokenStart + j)) {
+ count--; // token not found
+ break;
+ }
+ }
+ }
+ }
+ // add the new token to the list of seen tokens
+ tokensStart.add(start);
+ tokensLength.add(length);
+ }
+ // set token
+ token.set(buffer, start, length, count);
+ }
+
+ @Override
+ public boolean end() {
+ return token.length() <= 0;
+ }
+
+ @Override
+ public Token getToken() {
+ return token;
+ }
+
+ @Override
+ public void reset() {
+ index = 0;
+ tokensStart.reset();
+ tokensLength.reset();
+ advance();
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
new file mode 100644
index 0000000..d8a9185
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.io.Serializable;
+
+public interface IntTokenCountRank extends Serializable {
+ public int add(int token, int count);
+
+ public int getRank(int token, int count);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
new file mode 100644
index 0000000..acefb51
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.util.HashMap;
+
+import edu.uci.ics.asterix.fuzzyjoin.IntPair;
+
+public class IntTokenCountRankFrequency implements IntTokenCountRank {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ private final HashMap<IntPair, Integer> ranksMap = new HashMap<IntPair, Integer>();
+ private final IntPair tmpPair = new IntPair();
+ private int crtRank = 0;
+
+ @Override
+ public int add(int token, int count) {
+ int prevRank = crtRank;
+ ranksMap.put(new IntPair(token, count), prevRank);
+ crtRank++;
+ return prevRank;
+ }
+
+ @Override
+ public int getRank(int token, int count) {
+ tmpPair.set(token, count);
+ Integer rank = ranksMap.get(tmpPair);
+ if (rank == null) {
+ return -1;
+ }
+ return rank;
+ }
+
+ @Override
+ public String toString() {
+ return "[" + crtRank + ",\n " + ranksMap + "\n]";
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRank.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
new file mode 100644
index 0000000..a70a4d9
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.io.Serializable;
+
+public interface IntTokenRank extends Serializable {
+ public int add(int token);
+
+ public int getRank(int token);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
new file mode 100644
index 0000000..da83a69
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.util.HashMap;
+
+public class IntTokenRankFrequency implements IntTokenRank {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ private final HashMap<Integer, Integer> ranksMap = new HashMap<Integer, Integer>();
+ private int crtRank = 0;
+
+ @Override
+ public int add(int token) {
+ int prevRank = crtRank;
+ ranksMap.put(token, prevRank);
+ crtRank++;
+ return prevRank;
+ }
+
+ @Override
+ public int getRank(int token) {
+ Integer rank = ranksMap.get(token);
+ if (rank == null) {
+ return -1;
+ }
+ return rank;
+ }
+
+ @Override
+ public String toString() {
+ return "[" + crtRank + ",\n " + ranksMap + "\n]";
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenLoad.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenLoad.java
new file mode 100644
index 0000000..da6e47c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenLoad.java
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Serializable;
+
+import edu.uci.ics.asterix.fuzzyjoin.FuzzyJoinConfig;
+
+public class TokenLoad implements Serializable {
+ private final String path;
+ private final TokenRank rank;
+
+ public TokenLoad(String path, TokenRank rank) {
+ this.path = path;
+ this.rank = rank;
+ }
+
+ public void loadTokenRank() {
+ loadTokenRank(1);
+ }
+
+ public void loadTokenRank(int factor) {
+ try {
+ BufferedReader fis = new BufferedReader(
+ // new FileReader(path.toString())
+ new InputStreamReader(new FileInputStream(path), "UTF-8"));
+ String token = null;
+ while ((token = fis.readLine()) != null) {
+ rank.add(token);
+ // only used when increasing the token dictionary
+ for (int i = 1; i < factor; i++) {
+ // remove _COUNT at the end of the token (it is removed in
+ // the new records anyway)
+ rank.add(token.split(FuzzyJoinConfig.TOKEN_SEPARATOR_REGEX)[0] + i);
+ }
+ }
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ }
+}
\ No newline at end of file
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRank.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRank.java
new file mode 100644
index 0000000..6222bea
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRank.java
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.io.Serializable;
+import java.util.Collection;
+
+public interface TokenRank extends Serializable {
+ public int add(String token);
+
+ public Integer getRank(String token);
+
+ public Collection<Integer> getTokenRanks(Iterable<String> tokens);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
new file mode 100644
index 0000000..ab7bf2f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.util.Collection;
+import java.util.HashMap;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Token;
+
+public class TokenRankBufferedFrequency implements TokenRank {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ private final HashMap<Token, Integer> ranksMap = new HashMap<Token, Integer>();
+ private int crtRank = 0;
+
+ public int add(String stringWithCount) {
+ int end = stringWithCount.lastIndexOf('_');
+ int count = 0;
+ for (int i = end + 1; i < stringWithCount.length(); ++i) {
+ count = count * 10 + (stringWithCount.charAt(i) - '0');
+ }
+ return add(stringWithCount.substring(0, end), count);
+ }
+
+ public int add(String string, int count) {
+ Token token = new Token(string, 0, string.length(), count);
+ return add(token);
+ }
+
+ public int add(Token token) {
+ int prevRank = crtRank;
+ ranksMap.put(token, prevRank);
+ crtRank++;
+ return prevRank;
+ }
+
+ @Override
+ public Integer getRank(String token) {
+ throw new UnsupportedOperationException();
+ }
+
+ public Integer getRank(Token token) {
+ return ranksMap.get(token);
+ }
+
+ @Override
+ public Collection<Integer> getTokenRanks(Iterable<String> tokens) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public String toString() {
+ return "[" + crtRank + ",\n " + ranksMap + "\n]";
+ }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java
new file mode 100644
index 0000000..0ae6c35
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.TreeSet;
+
+public class TokenRankFrequency implements TokenRank {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ private final HashMap<String, Integer> ranksMap = new HashMap<String, Integer>();
+ private int crtRank = 0;
+
+ public int add(String token) {
+ int prevRank = crtRank;
+ ranksMap.put(token, prevRank);
+ crtRank++;
+ return prevRank;
+ }
+
+ public Integer getRank(String token) {
+ return ranksMap.get(token);
+ }
+
+ public Collection<Integer> getTokenRanks(Iterable<String> tokens) {
+ TreeSet<Integer> ranksCol = new TreeSet<Integer>();
+ for (String token : tokens) {
+ Integer rank = getRank(token);
+ if (rank != null) {
+ ranksCol.add(rank);
+ }
+ }
+ return ranksCol;
+ }
+
+ @Override
+ public String toString() {
+ return "[" + crtRank + ",\n " + ranksMap + "\n]";
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java
new file mode 100644
index 0000000..f5a8cec9
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.util.ArrayList;
+
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.FuzzyJoinMemory;
+import edu.uci.ics.asterix.fuzzyjoin.ResultSelfJoin;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.AbstractDataset;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.AbstractDataset.Directory;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.DBLPSmallDataset;
+
+public class FuzzyJoinTest {
+
+ private static final AbstractDataset dataset = new DBLPSmallDataset();
+ private static final String base = "data/";
+
+ @Test
+ public void test() throws Exception {
+
+ ArrayList<int[]> records = new ArrayList<int[]>();
+ ArrayList<Integer> rids = new ArrayList<Integer>();
+ ArrayList<ResultSelfJoin> results = new ArrayList<ResultSelfJoin>();
+
+ dataset.createDirecotries(new String[] { base });
+
+ FuzzyJoinMemory fj = new FuzzyJoinMemory(dataset.getThreshold());
+
+ FuzzyJoinMemory.readRecords(base + dataset.getPathPart0(Directory.SSJOININ), records, rids);
+
+ for (int[] record : records) {
+ results.addAll(fj.selfJoinAndAddRecord(record));
+ }
+
+ BufferedWriter out = new BufferedWriter(new FileWriter(base + dataset.getPathPart0(Directory.SSJOINOUT)));
+ for (ResultSelfJoin result : results) {
+ out.write(String.format("%d %d %.3f\n", rids.get(result.indexX), rids.get(result.indexY), result.similarity));
+ }
+ out.close();
+
+ FuzzyJoinTestUtil.verifyDirectory(base + dataset.getPathPart0(Directory.SSJOINOUT),
+ base + dataset.getPathExpected(Directory.SSJOINOUT));
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java
new file mode 100644
index 0000000..db44850
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+
+import org.junit.Assert;
+
+public class FuzzyJoinTestUtil {
+
+ public static void verifyDirectory(String pathTest, String pathCorrect)
+ throws IOException {
+ verifyDirectory(pathTest, pathCorrect, false);
+ }
+
+ public static void verifyDirectory(String pathTest, String pathCorrect,
+ boolean noDup) throws IOException {
+ int countTest = 0, countTestDedup = 0, countCorrect = 0;
+
+ BufferedReader input;
+ String line;
+ HashSet<String> buffer = new HashSet<String>();
+
+ // buffer Test
+ input = new BufferedReader(new FileReader(pathTest));
+ while ((line = input.readLine()) != null) {
+ buffer.add(line);
+ countTest++;
+ }
+ countTestDedup = buffer.size();
+
+ // probe Correct
+ input = new BufferedReader(new FileReader(new File(pathCorrect)));
+ while ((line = input.readLine()) != null) {
+ Assert.assertTrue(buffer.contains(line));
+ countCorrect++;
+ }
+
+ // check counts
+ Assert.assertEquals(countTestDedup, countCorrect);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
new file mode 100644
index 0000000..e65bb25
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
@@ -0,0 +1,239 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.HashedUTF8NGramTokenFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.NGramUTF8StringBinaryTokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.UTF8NGramTokenFactory;
+
+public class NGramTokenizerTest {
+
+ private char PRECHAR = '#';
+ private char POSTCHAR = '$';
+
+ private String str = "Jürgen S. Generic's Car";
+ private byte[] inputBuffer;
+
+ private int gramLength = 3;
+
+ private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
+
+ String tmp = s.toLowerCase();
+ if (prePost) {
+ StringBuilder preBuilder = new StringBuilder();
+ for (int i = 0; i < gramLength - 1; i++) {
+ preBuilder.append(PRECHAR);
+ }
+ String pre = preBuilder.toString();
+
+ StringBuilder postBuilder = new StringBuilder();
+ for (int i = 0; i < gramLength - 1; i++) {
+ postBuilder.append(POSTCHAR);
+ }
+ String post = postBuilder.toString();
+
+ tmp = pre + s.toLowerCase() + post;
+ }
+
+ for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
+ String gram = tmp.substring(i, i + gramLength);
+ grams.add(gram);
+ }
+ }
+
+ @Before
+ public void init() throws Exception {
+ // serialize string into bytes
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput dos = new DataOutputStream(baos);
+ dos.writeUTF(str);
+ inputBuffer = baos.toByteArray();
+ }
+
+ void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
+ HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
+ false, tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+ HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
+ for (String s : expectedGrams) {
+ Integer count = gramCounts.get(s);
+ if (count == null) {
+ count = 1;
+ gramCounts.put(s, count);
+ } else {
+ count++;
+ }
+
+ int hash = tokenHash(s, count);
+ expectedHashedGrams.add(hash);
+ }
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize hashed token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ Integer hashedGram = in.readInt();
+
+ // System.out.println(hashedGram);
+
+ Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
+
+ void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
+ HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+ tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+ for (String s : expectedGrams) {
+ int hash = tokenHash(s, 1);
+ expectedHashedGrams.add(hash);
+ }
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize hashed token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ Integer hashedGram = in.readInt();
+
+ // System.out.println(hashedGram);
+
+ Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
+
+ void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
+ UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+ tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize hashed token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ String strGram = in.readUTF();
+
+ // System.out.println("\"" + strGram + "\"");
+
+ Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
+
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
+
+ @Test
+ public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
+ runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
+ runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
+ }
+
+ @Test
+ public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
+ runTestNGramTokenizerWithHashedUTF8Tokens(false);
+ runTestNGramTokenizerWithHashedUTF8Tokens(true);
+ }
+
+ @Test
+ public void testNGramTokenizerWithUTF8Tokens() throws IOException {
+ runTestNGramTokenizerWithUTF8Tokens(false);
+ runTestNGramTokenizerWithUTF8Tokens(true);
+ }
+
+ public int tokenHash(String token, int tokenCount) {
+ int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+ for (int i = 0; i < token.length(); i++) {
+ h ^= token.charAt(i);
+ h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+ }
+ return h + tokenCount;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java
new file mode 100644
index 0000000..8fd05da
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java
@@ -0,0 +1,214 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.Assert;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.HashedUTF8WordTokenFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
+
+public class WordTokenizerTest {
+
+ private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
+ private byte[] inputBuffer;
+
+ private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
+ private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
+ private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
+
+ @Before
+ public void init() throws IOException {
+ // serialize text into bytes
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput dos = new DataOutputStream(baos);
+ dos.writeUTF(text);
+ inputBuffer = baos.toByteArray();
+
+ // init expected string tokens
+ expectedUTF8Tokens.add("hello");
+ expectedUTF8Tokens.add("world");
+ expectedUTF8Tokens.add("i");
+ expectedUTF8Tokens.add("would");
+ expectedUTF8Tokens.add("like");
+ expectedUTF8Tokens.add("to");
+ expectedUTF8Tokens.add("inform");
+ expectedUTF8Tokens.add("you");
+ expectedUTF8Tokens.add("of");
+ expectedUTF8Tokens.add("the");
+ expectedUTF8Tokens.add("importance");
+ expectedUTF8Tokens.add("of");
+ expectedUTF8Tokens.add("foo");
+ expectedUTF8Tokens.add("bar");
+ expectedUTF8Tokens.add("yes");
+ expectedUTF8Tokens.add("foo");
+ expectedUTF8Tokens.add("bar");
+ expectedUTF8Tokens.add("jürgen");
+
+ // hashed tokens ignoring token count
+ for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
+ int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
+ expectedHashedUTF8Tokens.add(hash);
+ }
+
+ // hashed tokens using token count
+ HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
+ for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
+ Integer count = tokenCounts.get(expectedUTF8Tokens.get(i));
+ if (count == null) {
+ count = 1;
+ tokenCounts.put(expectedUTF8Tokens.get(i), count);
+ } else {
+ count++;
+ }
+
+ int hash = tokenHash(expectedUTF8Tokens.get(i), count);
+ expectedCountedHashedUTF8Tokens.add(hash);
+ }
+ }
+
+ @Test
+ public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
+
+ HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+ DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false,
+ tokenFactory);
+
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ Integer hashedToken = in.readInt();
+
+ // System.out.println(hashedToken);
+
+ Assert.assertEquals(hashedToken, expectedCountedHashedUTF8Tokens.get(tokenCount));
+
+ tokenCount++;
+ }
+ }
+
+ @Test
+ public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
+
+ HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+ DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ Integer hashedToken = in.readInt();
+
+ // System.out.println(hashedToken);
+
+ Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken);
+
+ tokenCount++;
+ }
+ }
+
+ @Test
+ public void testWordTokenizerWithUTF8Tokens() throws IOException {
+
+ UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
+ DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize hashed token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ String strToken = in.readUTF();
+
+ // System.out.println(strToken);
+
+ Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
+
+ tokenCount++;
+ }
+ }
+
+ // JAQL
+ public int tokenHash(String token, int tokenCount) {
+ int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+ for (int i = 0; i < token.length(); i++) {
+ h ^= token.charAt(i);
+ h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+ }
+ return h + tokenCount;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java
new file mode 100644
index 0000000..5ca6c6d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java
@@ -0,0 +1,158 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.io.File;
+import java.util.NoSuchElementException;
+
+public abstract class AbstractDataset {
+ public static enum Directory {
+ RAW_R,
+ RAW_S,
+ RECORDPAIRS,
+ RECORDS_R,
+ RECORDS_S,
+ RECORDSBULK_R,
+ RECORDSBULK_S,
+ RIDPAIRS,
+ SSJOININ,
+ SSJOINOUT,
+ TOKENS,
+ TOKENS_R,
+ TOKENS_R_AQL,
+ }
+
+ public static enum Relation {
+ R, S,
+ }
+
+ public static final String FILE_PART = "part-";
+ public static final String FILE_PART0 = FILE_PART + "00000";
+ public static final String FILE_EXPECTED = "expected.txt";
+ public static final String AQL = "aql";
+
+ public static final String PATH_RAW = "raw";
+ public static final String PATH_RECORDPAIRS = "recordpairs";
+ public static final String PATH_RECORDS = "records";
+ public static final String PATH_RECORDSBULK = "recordsbulk";
+ public static final String PATH_RIDPAIRS = "ridpairs";
+ public static final String PATH_SSJOININ = "ssjoin.in";
+ public static final String PATH_SSJOINOUT = "ssjoin.out";
+ public static final String PATH_TOKENS = "tokens";
+
+ public static final String DIRECTORY_ID_FORMAT = "%03d";
+
+ public void createDirecotries(String[] paths) {
+ createDirecotries(paths, 0);
+ }
+
+ public void createDirecotries(String[] paths, int crtCopy) {
+ (new File(paths[0] + getPathDirecotry(Directory.SSJOINOUT, 0))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.RECORDSBULK_R, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.RECORDSBULK_S, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.RECORDS_R, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.RECORDS_S, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.TOKENS, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.TOKENS_R, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.TOKENS_R_AQL, crtCopy))).mkdir();
+ }
+
+ public abstract String getName();
+
+ public abstract int getNoRecords();
+
+ public abstract String getPath();
+
+ public String getPathDirecotry(Directory directory, int crtCopy) {
+ return getPathDirectory(getPath(), directory, crtCopy);
+ }
+
+ private String getPathDirectory(Directory directory, int crtCopy, boolean expected) {
+ return getPathDirectory(getName() + (expected ? ".expected" : ""), directory, crtCopy);
+ }
+
+ public String getPathDirectory(String path, Directory directory, int crtCopy) {
+ path += '/';
+ switch (directory) {
+ case SSJOININ:
+ path += AbstractDataset.PATH_SSJOININ;
+ break;
+ case SSJOINOUT:
+ path += AbstractDataset.PATH_SSJOINOUT;
+ break;
+ case RAW_R:
+ path += AbstractDataset.PATH_RAW + "." + getSuffix(Relation.R);
+ break;
+ case RAW_S:
+ path += AbstractDataset.PATH_RAW + "." + getSuffix(Relation.S);
+ break;
+ case RECORDSBULK_R:
+ path += AbstractDataset.PATH_RECORDSBULK + "." + getSuffix(Relation.R);
+ break;
+ case RECORDSBULK_S:
+ path += AbstractDataset.PATH_RECORDSBULK + "." + getSuffix(Relation.S);
+ break;
+ case RECORDS_R:
+ path += AbstractDataset.PATH_RECORDS + "." + getSuffix(Relation.R);
+ break;
+ case RECORDS_S:
+ path += AbstractDataset.PATH_RECORDS + "." + getSuffix(Relation.S);
+ break;
+ case TOKENS:
+ path += AbstractDataset.PATH_TOKENS;
+ break;
+ case TOKENS_R:
+ path += AbstractDataset.PATH_TOKENS + "." + getSuffix(Relation.R);
+ break;
+ case TOKENS_R_AQL:
+ path += AbstractDataset.PATH_TOKENS + "." + getSuffix(Relation.R) + "." + AQL;
+ break;
+ case RIDPAIRS:
+ path += AbstractDataset.PATH_RIDPAIRS;
+ break;
+ case RECORDPAIRS:
+ path += AbstractDataset.PATH_RECORDPAIRS;
+ break;
+ default:
+ throw new NoSuchElementException();
+ }
+ return path + "-" + String.format(DIRECTORY_ID_FORMAT, crtCopy);
+ }
+
+ public String getPathExpected(Directory directory) {
+ return getPathDirectory(directory, 0, true) + '/' + FILE_EXPECTED;
+ }
+
+ public String getPathPart(Directory directory, int crtCopy) {
+ return getPathDirecotry(directory, crtCopy) + '/' + FILE_PART;
+ }
+
+ public String getPathPart0(Directory directory) {
+ return getPathDirectory(directory, 0, false) + '/' + FILE_PART0;
+ }
+
+ public String getPathPart0(Directory directory, boolean expected) {
+ return getPathDirectory(directory, 0, expected) + '/' + (expected ? FILE_EXPECTED : FILE_PART0);
+ }
+
+ public abstract String getSuffix(Relation relation);
+
+ public abstract float getThreshold();
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java
new file mode 100644
index 0000000..5333cad
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java
@@ -0,0 +1,5 @@
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public abstract class AbstractTokenizableDataset extends AbstractDataset {
+ public abstract String getRecordData();
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java
new file mode 100644
index 0000000..3783829
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+
+public class DBLPDataset extends PublicationsDataset {
+ private static final String NAME = "dblp";
+ private static final int NO_RECORDS = 1268017;
+ private static final float THRESHOLD = .8f;
+ private static final String RECORD_DATA = "2,3";
+
+ public DBLPDataset() {
+ super(NAME, NO_RECORDS, THRESHOLD, RECORD_DATA, NAME, NAME);
+ }
+
+ public DBLPDataset(String recordData) {
+ super(NAME, NO_RECORDS, THRESHOLD, recordData, NAME, NAME);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java
new file mode 100644
index 0000000..5eaebd2
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class DBLPSmallDataset extends PublicationsDataset {
+ public DBLPSmallDataset() {
+ super("dblp-small", 100, .5f, "2,3", "dblp", "dblp");
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java
new file mode 100644
index 0000000..38aad37
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class IntArrayBagSmallDataset extends AbstractDataset {
+ private final int NO_RECORDS = 4;
+ private final String NAME = "intarray-bag-small";
+ private final String PATH = NAME;
+ private final float THRESHOLD = .5f;
+
+ public IntArrayBagSmallDataset() {
+ }
+
+ @Override
+ public String getName() {
+ return NAME;
+ }
+
+ @Override
+ public int getNoRecords() {
+ return NO_RECORDS;
+ }
+
+ @Override
+ public String getPath() {
+ return PATH;
+ }
+
+ @Override
+ public String getSuffix(Relation relation) {
+ return "r";
+ }
+
+ @Override
+ public float getThreshold() {
+ return THRESHOLD;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java
new file mode 100644
index 0000000..7c8c80d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class IntArraySetSmallDataset extends AbstractDataset {
+ private final int NO_RECORDS = 4;
+ private final String NAME = "intarray-set-small";
+ private final String PATH = NAME;
+ private final float THRESHOLD = .5f;
+
+ public IntArraySetSmallDataset() {
+ }
+
+ @Override
+ public String getName() {
+ return NAME;
+ }
+
+ @Override
+ public int getNoRecords() {
+ return NO_RECORDS;
+ }
+
+ @Override
+ public String getPath() {
+ return PATH;
+ }
+
+ @Override
+ public String getSuffix(Relation relation) {
+ return "r";
+ }
+
+ @Override
+ public float getThreshold() {
+ return THRESHOLD;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java
new file mode 100644
index 0000000..e8c2f2a
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class PUBDataset extends PublicationsDataset {
+ private static final String DBLP_SUFFIX = "dblp";
+ private static final String CSX_SUFFIX = "csx";
+ private static final String NAME = "pub";
+ private static final int NO_RECORDS = 1385532;
+ private static final float THRESHOLD = .8f;
+ private static final String RECORD_DATA = "2,3";
+
+ public PUBDataset() {
+ super(NAME, NO_RECORDS, THRESHOLD, RECORD_DATA, DBLP_SUFFIX, CSX_SUFFIX);
+ }
+
+ public PUBDataset(float threshold) {
+ super(NAME, NO_RECORDS, threshold, RECORD_DATA, DBLP_SUFFIX, CSX_SUFFIX);
+ }
+
+ public PUBDataset(float threshold, String recordData) {
+ super(NAME, NO_RECORDS, threshold, recordData, DBLP_SUFFIX, CSX_SUFFIX);
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java
new file mode 100644
index 0000000..eed28e4
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class PUBSmallDataset extends PublicationsDataset {
+ public PUBSmallDataset() {
+ super("pub-small", 100, .5f, "2,3", "dblp", "csx");
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java
new file mode 100644
index 0000000..e1653cd
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.util.NoSuchElementException;
+
+public class PublicationsDataset extends AbstractTokenizableDataset {
+ protected final String name;
+ protected final String path;
+ protected final int noRecords;
+ protected final float threshold;
+ protected final String recordData;
+ protected final String rSuffix, sSuffix;
+
+ public PublicationsDataset(String name, int noRecords, float threshold, String recordData, String rSuffix,
+ String sSuffix) {
+ this.name = name;
+ this.noRecords = noRecords;
+ this.threshold = threshold;
+ this.recordData = recordData;
+ this.rSuffix = rSuffix;
+ this.sSuffix = sSuffix;
+
+ path = name;
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public int getNoRecords() {
+ return noRecords;
+ }
+
+ @Override
+ public String getPath() {
+ return path;
+ }
+
+ @Override
+ public String getRecordData() {
+ return recordData;
+ }
+
+ @Override
+ public String getSuffix(Relation relation) {
+ switch (relation) {
+ case R:
+ return rSuffix;
+ case S:
+ return sSuffix;
+ default:
+ throw new NoSuchElementException();
+ }
+ }
+
+ @Override
+ public float getThreshold() {
+ return threshold;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java
new file mode 100644
index 0000000..6463b2d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.util.NoSuchElementException;
+
+public class UsersVisitorsSmallDataset extends AbstractDataset {
+ private final int NO_RECORDS = 4;
+ private final String NAME = "users-visitors-small";
+ private static final String USERS_SUFFIX = "users";
+ private static final String VISITORS_SUFFIX = "visitors";
+ private final String PATH = NAME;
+ private final float THRESHOLD = .5f;
+
+ public UsersVisitorsSmallDataset() {
+ }
+
+ @Override
+ public String getName() {
+ return NAME;
+ }
+
+ @Override
+ public int getNoRecords() {
+ return NO_RECORDS;
+ }
+
+ @Override
+ public String getPath() {
+ return PATH;
+ }
+
+ @Override
+ public String getSuffix(Relation relation) {
+ switch (relation) {
+ case R:
+ return USERS_SUFFIX;
+ case S:
+ return VISITORS_SUFFIX;
+ default:
+ throw new NoSuchElementException();
+ }
+ }
+
+ @Override
+ public float getThreshold() {
+ return THRESHOLD;
+ }
+
+}
diff --git a/asterix-fuzzyjoin/src/test/scripts/conf.sh b/asterix-fuzzyjoin/src/test/scripts/conf.sh
new file mode 100644
index 0000000..45e962c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/conf.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS"; BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+### http://www.cse.unsw.edu.au/~weiw/project/simjoin.html
+SSJOIN=/home/rares/workspace/ssjoin-bin
+DATA=../data
+
+IN=ssjoin.in-000
+OUT=ssjoin.out-000
diff --git a/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh b/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh
new file mode 100755
index 0000000..0cd6ccc
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS"; BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DIR=`dirname $0`; if [ "${DIR:0:1}" == "." ]; then DIR=`pwd`"${DIR:1}"; fi
+source $DIR/conf.sh
+
+ARGS=1 # Required number of arguments
+E_BADARGS=85 # Wrong number of arguments passed to script.
+if [ $# -lt "$ARGS" ]
+then
+ echo "Usage: `basename $0` dataset"
+ echo "Example: `basename $0` dblp-small"
+ exit $E_BADARGS
+fi
+
+THR="0.80"
+if [ "$1" == "dblp-small" ]; then
+ THR="0.50"
+fi
+
+
+mkdir $DATA/$1.expected/$OUT
+$SSJOIN/ppjoinplus j $THR $DATA/$1/$IN/part-00000 | \
+ sed 's/0\.812/0\.813/' | \
+ sort > $DATA/$1.expected/$OUT/expected.txt
+
+mkdir $DATA/$1/$OUT
+java \
+ -Xmx2g \
+ -jar $DIR/../../../target/fuzzyjoin-core-0.0.1.jar \
+ $THR $DATA/$1/$IN/part-00000 | \
+ sort > $DATA/$1/$OUT/part-00000
+
+diff $DATA/$1.expected/$OUT/expected.txt $DATA/$1/$OUT/part-00000
diff --git a/asterix-fuzzyjoin/src/test/scripts/inmemory.sh b/asterix-fuzzyjoin/src/test/scripts/inmemory.sh
new file mode 100755
index 0000000..c9394d4
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/inmemory.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS"; BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DATA="/home/rares/data/fuzzyjoin/dblp/records-024"
+FUZZYJOIN="/home/rares/fuzzyjoin/fuzzyjoin-core/target/fuzzyjoin-core-0.0.2-SNAPSHOT.jar"
+
+echo "-- - Step 0: Project and append length - --"
+
+# java -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinAppendLength $DATA/part-00000 $DATA/part-00000-len
+
+date
+
+echo "== START =="
+
+echo "-- - Step 1: Sort by length - --"
+
+# time sort -n -k 5 -t ":" $DATA/part-00000-len > $DATA/part-00000-len-sorted
+
+echo "-- - Step 2: Tokenize - --"
+
+# time java -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinTokenize $DATA/part-00000-len-sorted $DATA/part-00000-tokens $DATA/part-00000-tokenized
+
+echo "-- - Step 3: RID pairs - --"
+
+time java -Xmx8g -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinMemory .8 $DATA/part-00000-tokenized > $DATA/part-00000-ridpairs
+
+echo "== END =="
+
+date
+
+
+### SSJoin ###
+# cut -d ":" -f 3,4 records-000/part-0000* > ! ssjoin.raw-000/part-00000
+# ~/workspace/ssjoin-bin/txtformat ssjoin.raw-000/part-00000 ssjoin.norm-000/part-00000 l
+# sed 's/_\+/ /g' ssjoin.norm-000/part-00000 > ! ssjoin.space-000/part-00000
+# ~/workspace/ssjoin-bin/tokenizer ssjoin.space-000/part-00000
+# ~/workspace/ssjoin-bin/ppjoinplus j .8 ssjoin.space-000/part-00000.bin > /dev/null
+# java -jar /fuzzyjoin/fuzzyjoin-core/target/fuzzyjoin-core-0.0.2-SNAPSHOT.jar .8 ssjoin.space-000/part-00000.bin > /dev/null
diff --git a/asterix-fuzzyjoin/src/test/scripts/tokenize.sh b/asterix-fuzzyjoin/src/test/scripts/tokenize.sh
new file mode 100755
index 0000000..5498d44
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/tokenize.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS"; BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DIR=`dirname $0`; if [ "${DIR:0:1}" == "." ]; then DIR=`pwd`"${DIR:1}"; fi
+source $DIR/conf.sh
+
+ARGS=1 # Required number of arguments
+E_BADARGS=85 # Wrong number of arguments passed to script.
+if [ $# -lt "$ARGS" ]
+then
+ echo "Usage: `basename $0` dataset"
+ echo "Example: `basename $0` dblp-small"
+ exit $E_BADARGS
+fi
+
+$SSJOIN/tokenizer $DATA/$1/raw-000/part-00000 $2
+mkdir $DATA/$1/$IN
+mv $DATA/$1/raw-000/part-00000.bin $DATA/$1/$IN/part-00000
+