adding fuzzyjoin code to git
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyFiltersJaccard.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyFiltersJaccard.java
new file mode 100644
index 0000000..75029a8
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyFiltersJaccard.java
@@ -0,0 +1,104 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class FuzzyFiltersJaccard {
+
+    /**
+     * type is double because with float .8 / (1 + .8) * (8 + 10) = 8.0...01
+     */
+    protected final double simThr;
+    protected final double simThrExpr;
+
+    public FuzzyFiltersJaccard(double similarityThreshold) {
+        simThr = similarityThreshold;
+        simThrExpr = simThr / (1 + simThr);
+    }
+
+    public int getIndexPrefixLength(int length) {
+        return length - (int) Math.ceil(2 * simThrExpr * length) + 1;
+    }
+
+    public int getIntersectLowerBound(int lengthX, int lengthY) {
+        return (int) Math.ceil(simThrExpr * (lengthX + lengthY));
+    }
+
+    public long getIntersectLowerBound(long lengthX, long lengthY) {
+        return (long) Math.ceil(simThrExpr * (lengthX + lengthY));
+    }
+
+    public int getIntersectUpperBound(int noGramsCommon, int positionX, int positionY, int lengthX, int lengthY) {
+        return noGramsCommon + Math.min(lengthX - positionX - 1, lengthY - positionY - 1);
+    }
+
+    public long getIntersectUpperBound(int noGramsCommon, long positionX, long positionY, long lengthX, long lengthY) {
+        return noGramsCommon + Math.min(lengthX - positionX - 1, lengthY - positionY - 1);
+    }
+
+    public int getLengthLowerBound(int length) {
+        return (int) Math.ceil(simThr * length);
+    }
+
+    public long getLengthLowerBound(long length) {
+        return (long) Math.ceil(simThr * length);
+    }
+
+    public int getPrefixLength(int length) {
+        return length - (int) Math.ceil(simThr * length) + 1;
+    }
+
+    public long getPrefixLength(long length) {
+        return length - (long) Math.ceil(simThr * length) + 1;
+    }
+
+    public double getSimilarityThreshold() {
+        return simThr;
+    }
+
+    public boolean passLengthFilter(int lengthX, int lengthY) {
+        return getLengthLowerBound(lengthX) <= lengthY && lengthY <= 1 / simThr * lengthX;
+    }
+
+    public boolean passLengthFilter(long lengthX, long lengthY) {
+        return getLengthLowerBound(lengthX) <= lengthY && lengthY <= 1 / simThr * lengthX;
+    }
+
+    /**
+     * @param noGramsCommon
+     *            number of grams in common
+     * @param positionX
+     *            position of the last gram in common on X
+     * @param positionY
+     *            position of the last gram in common on X
+     * @param lengthX
+     * @param lengthY
+     * @return
+     */
+    public boolean passPositionFilter(int noGramsCommon, int positionX, int positionY, int lengthX, int lengthY) {
+        return getIntersectUpperBound(noGramsCommon, positionX, positionY, lengthX, lengthY) >= getIntersectLowerBound(
+                lengthX, lengthY);
+    }
+
+    public boolean passPositionFilter(int noGramsCommon, long positionX, long positionY, long lengthX, long lengthY) {
+        return getIntersectUpperBound(noGramsCommon, positionX, positionY, lengthX, lengthY) >= getIntersectLowerBound(
+                lengthX, lengthY);
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinAppendLength.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
new file mode 100644
index 0000000..b08aa29
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Tokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
+
+public class FuzzyJoinAppendLength {
+    public static void main(String args[]) throws IOException {
+        final String inputFileName = args[0];
+        final String outputFileName = args[1];
+
+        BufferedReader input = new BufferedReader(new FileReader(inputFileName));
+        BufferedWriter output = new BufferedWriter(new FileWriter(outputFileName));
+
+        Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE,
+                FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
+
+        int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3");
+
+        String line;
+        HashMap<String, MutableInteger> tokenCount = new HashMap<String, MutableInteger>();
+        while ((line = input.readLine()) != null) {
+            String[] splits = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
+            Collection<String> tokens = tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns,
+                    FuzzyJoinConfig.TOKEN_SEPARATOR));
+            output.write(splits[0] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[1] + FuzzyJoinConfig.RECORD_SEPARATOR
+                    + splits[2] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[3] + FuzzyJoinConfig.RECORD_SEPARATOR
+                    + tokens.size() + "\n");
+        }
+
+        input.close();
+        output.close();
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinConfig.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinConfig.java
new file mode 100644
index 0000000..03806a6
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinConfig.java
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class FuzzyJoinConfig {
+    private static final String NAMESPACE = "fuzzyjoin";
+    //
+    // tokenizer
+    //
+    public static final String TOKENIZER_PROPERTY = NAMESPACE + ".tokenizer";
+    public static final String TOKENIZER_VALUE = "Word";
+    //
+    // similarity
+    //
+    public static final String SIMILARITY_NAME_PROPERTY = NAMESPACE + ".similarity.name";
+    public static final String SIMILARITY_NAME_VALUE = "Jaccard";
+    public static final String SIMILARITY_THRESHOLD_PROPERTY = NAMESPACE + ".similarity.threshold";
+    public static final float SIMILARITY_THRESHOLD_VALUE = .8f;
+    //
+    // record
+    //
+    public static final String RECORD_DATA_PROPERTY = NAMESPACE + ".record.data";
+    public static final String RECORD_DATA_VALUE = "1";
+    public static final String RECORD_DATA1_PROPERTY = NAMESPACE + ".record.data1";
+    public static final String RECORD_DATA1_VALUE = "1";
+    //
+    // data
+    //    
+    public static final String DATA_TOKENS_PROPERTY = NAMESPACE + ".data.tokens";
+    //
+    // const
+    //
+    public static final String RECORD_DATA_VALUE_SEPARATOR_REGEX = ",";
+    public static final char WORD_SEPARATOR = '_';
+    public static final String WORD_SEPARATOR_REGEX = "_";
+    public static final char TOKEN_SEPARATOR = '_';
+    public static final String TOKEN_SEPARATOR_REGEX = "_";
+    public static final int RECORD_KEY = 0;
+    //
+    // separators
+    //
+    public static final char TOKEN_RANK_SEPARATOR = '_';
+    public static final char RECORD_SEPARATOR = ':';
+    public static final String RECORD_SEPARATOR_REGEX = ":";
+    public static final char RECORD_EXTRA_SEPARATOR = ';';
+    public static final String RECORD_EXTRA_SEPARATOR_REGEX = ";";
+    public static final char RIDPAIRS_SEPARATOR = ' ';
+    public static final String RIDPAIRS_SEPARATOR_REGEX = " ";
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinContext.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinContext.java
new file mode 100644
index 0000000..738bdc8
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinContext.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.util.ArrayList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard;
+
+public class FuzzyJoinContext {
+    public final float similarityThreshold;
+    public final SimilarityFiltersJaccard similarityFilters;
+    public final ArrayList<int[]> records;
+    public final ArrayList<ResultSelfJoin> results;
+
+    public FuzzyJoinContext(float similarityThreshold) {
+        this.similarityThreshold = similarityThreshold;
+        similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
+        records = new ArrayList<int[]>();
+        results = new ArrayList<ResultSelfJoin>();
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinMemory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinMemory.java
new file mode 100644
index 0000000..ec2a411
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinMemory.java
@@ -0,0 +1,317 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import edu.uci.ics.asterix.fuzzyjoin.invertedlist.InvertedListLengthList;
+import edu.uci.ics.asterix.fuzzyjoin.invertedlist.InvertedListsLengthList;
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard;
+
+public class FuzzyJoinMemory {
+    public static void main(String[] args) {
+        if (args.length < 2) {
+            System.err.println("Usage: <threshold> <file> [no runs, e.g., 1] [warm-up factor, e.g., 1]");
+            System.exit(2);
+        }
+
+        float similarityThreshold = Float.valueOf(args[0]);
+        String fileName = args[1];
+
+        int noRuns = 1, warmUpFactor = 1;
+        if (args.length > 2) {
+            noRuns = Integer.valueOf(args[2]);
+            if (args.length > 3) {
+                warmUpFactor = Integer.valueOf(args[3]);
+            }
+        }
+
+        System.err.println("Document: " + fileName);
+        System.err.println("... LOADING DATASET ...");
+
+        ArrayList<int[]> records = new ArrayList<int[]>();
+        ArrayList<Integer> rids = new ArrayList<Integer>();
+
+        FuzzyJoinMemory fj = new FuzzyJoinMemory(similarityThreshold);
+
+        FuzzyJoinMemory.readRecords(fileName, records, rids);
+
+        System.err.println("Algorithm: ppjoin");
+        System.err.println("Threshold: Jaccard " + similarityThreshold);
+
+        List<ResultSelfJoin> results = fj.runs(records, noRuns, warmUpFactor);
+
+        for (ResultSelfJoin result : results) {
+            System.out.format("%d %d %.3f", rids.get(result.indexX), rids.get(result.indexY), result.similarity);
+            System.out.println();
+            // System.out.format("(" + result.indexX + "," + result.indexY +
+            // ")");
+            // System.out.println();
+            // System.out.format("(" + rids.get(result.indexX) + ","
+            // + rids.get(result.indexY) + ")\t" + result.similarity);
+            // System.out.println();
+        }
+    }
+
+    public static void readRecords(String fileName, List<int[]> records, List<Integer> rids) {
+        LittleEndianIntInputStream in;
+        try {
+            in = new LittleEndianIntInputStream(new BufferedInputStream(new FileInputStream(fileName)));
+        } catch (FileNotFoundException e) {
+            throw new RuntimeException(e);
+        }
+
+        while (true) {
+            int rid = 0;
+            try {
+                rid = in.readInt();
+            } catch (IOException e) {
+                // FILE_EXPECTED reach of EOF
+                break;
+            }
+
+            rids.add(rid);
+            int[] record;
+
+            try {
+                int size = in.readInt();
+                record = new int[size];
+                for (int j = 0; j < size; j++) {
+                    int token = in.readInt();
+                    record[j] = token;
+                }
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+
+            records.add(record);
+        }
+    }
+
+    private final InvertedListsLengthList invertedLists;
+    private final SimilarityFiltersJaccard similarityFilters;
+
+    private final ArrayList<int[]> records;
+
+    public FuzzyJoinMemory(float similarityThreshold) {
+        invertedLists = new InvertedListsLengthList();
+        similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
+        records = new ArrayList<int[]>();
+    }
+
+    public void add(final int[] tokens) {
+        final int index = records.size();
+        final int length = tokens.length;
+        final int indexPrefixLength = similarityFilters.getPrefixLength(length);
+
+        for (int indexToken = 0; indexToken < indexPrefixLength; indexToken++) {
+            invertedLists.index(tokens[indexToken], new int[] { index, indexToken, length });
+        }
+        records.add(tokens);
+    }
+
+    public ArrayList<ResultJoin> join(final int[] tokens, final int length) {
+        final int prefixLength = similarityFilters.getPrefixLength(length);
+        final int lengthLowerBound = similarityFilters.getLengthLowerBound(length);
+        //
+        // self join
+        //
+        final HashMap<Integer, Integer> counts = new HashMap<Integer, Integer>();
+        for (int indexToken = 0; indexToken < Math.min(prefixLength, tokens.length); indexToken++) {
+            final int token = tokens[indexToken];
+            //
+            // probe index
+            //
+            InvertedListLengthList invertedList = invertedLists.get(token);
+            if (invertedList != null) {
+                // length filter
+                invertedList.setMinLength(lengthLowerBound);
+                for (int[] element : invertedList) {
+                    final int indexProbe = element[0];
+                    final int indexTokenProbe = element[1];
+                    final int lengthProbe = element[2];
+                    Integer count = counts.get(indexProbe);
+                    if (count == null) {
+                        count = 0;
+                    }
+
+                    if (count != -1) {
+                        count++;
+                        // position filter
+                        if (!similarityFilters.passPositionFilter(count, indexToken, length, indexTokenProbe,
+                                lengthProbe)) {
+                            count = -1;
+                        }
+                        // suffix filter
+                        if (count == 1
+                                && !similarityFilters.passSuffixFilter(tokens, indexToken, records.get(indexProbe),
+                                        indexTokenProbe)) {
+                            count = -1;
+                        }
+                        counts.put(indexProbe, count);
+                    }
+                }
+            }
+        }
+        //
+        // verify candidates
+        //
+        ArrayList<ResultJoin> results = new ArrayList<ResultJoin>();
+        for (Map.Entry<Integer, Integer> cand : counts.entrySet()) {
+            int count = cand.getValue();
+            int indexProbe = cand.getKey();
+            if (count > 0) {
+                int tokensProbe[] = records.get(indexProbe);
+                float similarity = similarityFilters.passSimilarityFilter(tokens, prefixLength, tokensProbe,
+                        similarityFilters.getPrefixLength(tokensProbe.length), count);
+                if (similarity > 0) {
+                    results.add(new ResultJoin(indexProbe, similarity));
+                }
+            }
+        }
+        return results;
+    }
+
+    public void prune(int length) {
+        final int lengthLowerBound = similarityFilters.getLengthLowerBound(length + 1);
+        invertedLists.prune(lengthLowerBound);
+    }
+
+    public List<ResultSelfJoin> runs(Collection<int[]> records, int noRuns, int warmupFactor) {
+        if (records.size() < 2) {
+            return new ArrayList<ResultSelfJoin>();
+        }
+
+        int noRunsTotal = noRuns * warmupFactor;
+        float runtime = 0, runtimeAverage = 0;
+        ArrayList<ResultSelfJoin> results = new ArrayList<ResultSelfJoin>();
+
+        System.err.println("# Records: " + records.size());
+        System.err.print("=== BEGIN JOIN (TIMER STARTED) === ");
+        for (int i = 1; i <= noRunsTotal; i++) {
+            System.err.print(".");
+            System.err.flush();
+
+            results.clear();
+            Runtime.getRuntime().gc();
+
+            Date startTime = new Date();
+            for (int[] record : records) {
+                results.addAll(selfJoinAndAddRecord(record));
+            }
+            Date endTime = new Date();
+            runtime = (endTime.getTime() - startTime.getTime()) / (float) 1000.0;
+
+            if (i >= noRunsTotal - noRuns) {
+                runtimeAverage += runtime;
+            }
+        }
+        System.err.println();
+        System.err.println("# Results: " + results.size());
+        System.err.println("=== END JOIN (TIMER STOPPED) ===");
+        System.err.println("Total Running Time:  " + runtimeAverage / noRuns + " (" + runtime + ")");
+        System.err.println();
+        return results;
+    }
+
+    public ArrayList<ResultSelfJoin> selfJoinAndAddRecord(final int[] tokens) {
+        final int index = records.size();
+        final int length = tokens.length;
+        final int prefixLength = similarityFilters.getPrefixLength(length);
+        final int indexPrefixLength = similarityFilters.getIndexPrefixLength(length);
+        final int lengthLowerBound = similarityFilters.getLengthLowerBound(length);
+        //
+        // self join
+        //
+        final HashMap<Integer, Integer> counts = new HashMap<Integer, Integer>();
+        for (int indexToken = 0; indexToken < prefixLength; indexToken++) {
+            final int token = tokens[indexToken];
+            //
+            // probe index
+            //
+            InvertedListLengthList invertedList = invertedLists.get(token);
+            if (invertedList != null) {
+                // length filter
+                invertedList.setMinLength(lengthLowerBound);
+                for (int[] element : invertedList) {
+                    final int indexProbe = element[0];
+                    final int indexTokenProbe = element[1];
+                    final int lengthProbe = element[2];
+                    Integer count = counts.get(indexProbe);
+                    if (count == null) {
+                        count = 0;
+                    }
+
+                    if (count != -1) {
+                        count++;
+                        // position filter
+                        if (!similarityFilters.passPositionFilter(count, indexToken, length, indexTokenProbe,
+                                lengthProbe)) {
+                            count = -1;
+                        }
+                        // suffix filter
+                        if (count == 1
+                                && !similarityFilters.passSuffixFilter(tokens, indexToken, records.get(indexProbe),
+                                        indexTokenProbe)) {
+                            count = -1;
+                        }
+                        counts.put(indexProbe, count);
+                    }
+                }
+            }
+            //
+            // add to index
+            //
+            if (indexToken < indexPrefixLength) {
+                invertedLists.index(token, new int[] { index, indexToken, length });
+            }
+        }
+        //
+        // add record
+        //
+        records.add(tokens);
+        //
+        // verify candidates
+        //
+        ArrayList<ResultSelfJoin> results = new ArrayList<ResultSelfJoin>();
+        for (Map.Entry<Integer, Integer> cand : counts.entrySet()) {
+            int count = cand.getValue();
+            int indexProbe = cand.getKey();
+            if (count > 0) {
+                int tokensProbe[] = records.get(indexProbe);
+                float similarity = similarityFilters.passSimilarityFilter(tokens, prefixLength, tokensProbe,
+                        similarityFilters.getIndexPrefixLength(tokensProbe.length), count);
+                if (similarity > 0) {
+                    results.add(new ResultSelfJoin(index, indexProbe, similarity));
+                }
+            }
+        }
+        return results;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinTokenize.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinTokenize.java
new file mode 100644
index 0000000..aa8927d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinTokenize.java
@@ -0,0 +1,135 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Tokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenorder.TokenLoad;
+import edu.uci.ics.asterix.fuzzyjoin.tokenorder.TokenRank;
+import edu.uci.ics.asterix.fuzzyjoin.tokenorder.TokenRankFrequency;
+
+public class FuzzyJoinTokenize {
+    public static class TokenCount implements Comparable {
+        public String token;
+        public MutableInteger count;
+
+        public TokenCount(String token, MutableInteger count) {
+            this.token = token;
+            this.count = count;
+        }
+
+        @Override
+        public int compareTo(Object o) {
+            TokenCount tc = (TokenCount) o;
+            return count.compareTo(tc.count);
+        }
+
+        public String getToken() {
+            return token;
+        }
+
+        @Override
+        public String toString() {
+            return token + " " + count;
+        }
+    }
+
+    public static void main(String args[]) throws IOException {
+        final String inputFileName = args[0];
+        final String tokensFileName = args[1];
+        final String tokenizedFileName = args[2];
+
+        BufferedReader input = new BufferedReader(new FileReader(inputFileName));
+
+        Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE,
+                FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
+
+        int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3");
+
+        String line;
+        HashMap<String, MutableInteger> tokenCount = new HashMap<String, MutableInteger>();
+        while ((line = input.readLine()) != null) {
+            Collection<String> tokens = tokenizer.tokenize(FuzzyJoinUtil.getData(
+                    line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX), dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR));
+
+            for (String token : tokens) {
+                MutableInteger count = tokenCount.get(token);
+                if (count == null) {
+                    tokenCount.put(token, new MutableInteger(1));
+                } else {
+                    count.inc();
+                }
+            }
+        }
+
+        input.close();
+
+        ArrayList<TokenCount> tokenCounts = new ArrayList<TokenCount>();
+        for (Map.Entry<String, MutableInteger> entry : tokenCount.entrySet()) {
+            tokenCounts.add(new TokenCount(entry.getKey(), entry.getValue()));
+        }
+        Collections.sort(tokenCounts);
+
+        BufferedWriter outputTokens = new BufferedWriter(new FileWriter(tokensFileName));
+        for (TokenCount tc : tokenCounts) {
+            outputTokens.write(tc.getToken() + "\n");
+        }
+        outputTokens.close();
+
+        TokenRank tokenRank = new TokenRankFrequency();
+        TokenLoad tokenLoad = new TokenLoad(tokensFileName, tokenRank);
+        tokenLoad.loadTokenRank();
+
+        input = new BufferedReader(new FileReader(inputFileName));
+        LittleEndianIntOutputStream outputTokenized = new LittleEndianIntOutputStream(new BufferedOutputStream(
+                new FileOutputStream(tokenizedFileName)));
+        while ((line = input.readLine()) != null) {
+            String splits[] = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
+            int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
+            outputTokenized.writeInt(rid);
+            Collection<String> tokens = tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns,
+                    FuzzyJoinConfig.TOKEN_SEPARATOR));
+            Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens);
+            outputTokenized.writeInt(tokensRanked.size());
+            for (Integer token : tokensRanked) {
+                outputTokenized.writeInt(token);
+            }
+            // for (int i = 0; i < tokens.size() - tokensRanked.size(); i++) {
+            // outputTokenized.writeInt(Integer.MAX_VALUE);
+            // }
+        }
+
+        input.close();
+        outputTokenized.close();
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinUtil.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinUtil.java
new file mode 100644
index 0000000..d947e24
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/FuzzyJoinUtil.java
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.util.regex.Pattern;
+
+public class FuzzyJoinUtil {
+    private static final Pattern rePunctuation = Pattern.compile("[^\\p{L}\\p{N}]"); // L:Letter, N:Number
+    private static final Pattern reSpaceOrAnderscore = Pattern.compile("(_|\\s)+");
+
+    public static String clean(String in) {
+        /*
+         * - remove punctuation
+         * 
+         * - normalize case
+         * 
+         * - remove extra spaces
+         * 
+         * - repalce space with FuzzyJoinDriver.TOKEN_SEPARATOR
+         */
+
+        in = rePunctuation.matcher(in).replaceAll(" ");
+        in = reSpaceOrAnderscore.matcher(in).replaceAll(" ");
+        in = in.trim();
+        in = in.replace(' ', '_');
+        in = in.toLowerCase();
+        return in;
+    }
+
+    /**
+     * @param splits
+     *            splitted record
+     * @param dataColumns
+     *            column index of data columns
+     * @param tokenSeparator
+     *            TODO
+     * @return concatenation of data column values
+     */
+    public static String getData(Object[] splits, int[] dataColumns, char tokenSeparator) {
+        String data = null;
+        for (int dataColumn : dataColumns) {
+            if (data != null) {
+                data += tokenSeparator;
+            }
+            if (splits.length > dataColumn) {
+                if (data == null) {
+                    data = "";
+                }
+                // data += splits[dataColumns[i]];
+                data += clean(splits[dataColumn].toString());
+            }
+        }
+        return data;
+    }
+
+    /**
+     * @param columnsString
+     *            string containing the indexes of the columns containing data
+     * @return array of data columns indexes
+     */
+    public static int[] getDataColumns(String columnsString) {
+        String[] splits = columnsString.split(FuzzyJoinConfig.RECORD_DATA_VALUE_SEPARATOR_REGEX);
+        int[] columns = new int[splits.length];
+        for (int i = 0; i < splits.length; i++) {
+            columns[i] = Integer.parseInt(splits[i]);
+        }
+        return columns;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntArray.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntArray.java
new file mode 100644
index 0000000..7170e87
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntArray.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.util.Arrays;
+
+public class IntArray {
+    private static final int SIZE = 128;
+
+    private int[] data;
+    private int length;
+
+    public IntArray() {
+        data = new int[SIZE];
+        length = 0;
+    }
+
+    public void add(int d) {
+        if (length == data.length) {
+            data = Arrays.copyOf(data, data.length << 1);
+        }
+        data[length++] = d;
+    }
+
+    public int[] get() {
+        return data;
+    }
+
+    public int get(int i) {
+        return data[i];
+    }
+
+    public int length() {
+        return length;
+    }
+
+    public void reset() {
+        length = 0;
+    }
+
+    public void sort() {
+        sort(0, length);
+    }
+
+    public void sort(int start, int end) {
+        Arrays.sort(data, start, end);
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder out = new StringBuilder();
+        out.append('[');
+        for (int i = 0; i < length; ++i) {
+            out.append(data[i]);
+            if (i < length - 1) {
+                out.append(',');
+                out.append(' ');
+            }
+        }
+        out.append(']');
+        return out.toString();
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntPair.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntPair.java
new file mode 100644
index 0000000..1d54e82
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/IntPair.java
@@ -0,0 +1,96 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class IntPair {
+
+    public static int[] PRIME = new int[] { 17, 31 }; // used for hashCode
+    // computations
+
+    protected int first;
+
+    protected int second;
+
+    public IntPair() {
+    }
+
+    public IntPair(int first, int second) {
+        this.first = first;
+        this.second = second;
+    }
+
+    public int compareTo(Object o) {
+        if (this == o) {
+            return 0;
+        }
+        IntPair p = (IntPair) o;
+        if (first != p.first) {
+            return first < p.first ? -1 : 1;
+        }
+        return second < p.second ? -1 : second > p.second ? 1 : 0;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (o == null) {
+            return false;
+        }
+        if (this == o) {
+            return true;
+        }
+        if (!(o instanceof IntPair)) {
+            return false;
+        }
+        IntPair p = (IntPair) o;
+        return first == p.first && second == p.second;
+    }
+
+    public int getFirst() {
+        return first;
+    }
+
+    public int getSecond() {
+        return second;
+    }
+
+    @Override
+    public int hashCode() {
+        return first * PRIME[0] + second;
+    }
+
+    public void set(int first, int second) {
+        this.first = first;
+        this.second = second;
+    }
+
+    public void setFirst(int first) {
+        this.first = first;
+    }
+
+    public void setSecond(int second) {
+        this.second = second;
+    }
+
+    @Override
+    public String toString() {
+        return "(" + first + "," + second + ")";
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntInputStream.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntInputStream.java
new file mode 100644
index 0000000..e17b694
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntInputStream.java
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.EOFException;
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+public class LittleEndianIntInputStream extends FilterInputStream {
+
+    public LittleEndianIntInputStream(InputStream in) {
+        super(in);
+    }
+
+    public int readInt() throws IOException {
+        int a = read();
+        if (a == -1) {
+            throw new EOFException();
+        }
+        int b = read();
+        if (b == -1) {
+            throw new EOFException();
+        }
+        int c = read();
+        if (c == -1) {
+            throw new EOFException();
+        }
+        int d = read();
+        if (d == -1) {
+            throw new EOFException();
+        }
+        return (a | (b << 8) | (c << 16) | (d << 24)); // little endian
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntOutputStream.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntOutputStream.java
new file mode 100644
index 0000000..8b45338
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/LittleEndianIntOutputStream.java
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+import java.io.FilterOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+public class LittleEndianIntOutputStream extends FilterOutputStream {
+    public LittleEndianIntOutputStream(OutputStream in) {
+        super(in);
+    }
+
+    public void writeInt(int v) throws IOException {
+        write((byte) (0xff & v));
+        write((byte) (0xff & (v >> 8)));
+        write((byte) (0xff & (v >> 16)));
+        write((byte) (0xff & (v >> 24)));
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/MutableInteger.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/MutableInteger.java
new file mode 100644
index 0000000..8dd8748
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/MutableInteger.java
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class MutableInteger implements Comparable {
+    private int v;
+
+    public MutableInteger(int v) {
+        this.v = v;
+    }
+
+    @Override
+    public int compareTo(Object o) {
+        MutableInteger m = (MutableInteger) o;
+        return v - m.v;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (o == null) {
+            return false;
+        }
+        if (o == this) {
+            return true;
+        }
+        if (!(o instanceof MutableInteger)) {
+            return false;
+        }
+        MutableInteger m = (MutableInteger) o;
+        if (m.v == v) {
+            return true;
+        }
+        return false;
+    }
+
+    public int get() {
+        return v;
+    }
+
+    public void inc() {
+        v += 1;
+    }
+
+    public void set(int v) {
+        this.v = v;
+    }
+
+    @Override
+    public String toString() {
+        return "" + v;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/RIDPairSimilarity.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/RIDPairSimilarity.java
new file mode 100644
index 0000000..64a3ea1
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/RIDPairSimilarity.java
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class RIDPairSimilarity {
+    public int rid1, rid2;
+    public float similarity;
+
+    public RIDPairSimilarity() {
+    }
+
+    public RIDPairSimilarity(int rid1, int rid2, float similarity) {
+        set(rid1, rid2, similarity);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        RIDPairSimilarity r = (RIDPairSimilarity) o;
+        return rid1 == r.rid1 && rid2 == r.rid2;
+    }
+
+    @Override
+    public int hashCode() {
+        return rid1 * rid2 * (rid1 - rid2);
+    }
+
+    public void set(int rid1, int rid2, float similarity) {
+        this.rid1 = rid1;
+        this.rid2 = rid2;
+        this.similarity = similarity;
+    }
+
+    @Override
+    public String toString() {
+        return "{(" + rid1 + ", " + rid2 + "), " + similarity + "}";
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultJoin.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultJoin.java
new file mode 100644
index 0000000..f4b6c34
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultJoin.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class ResultJoin {
+    public int index;
+    public float similarity;
+
+    public ResultJoin(int index, float similarity) {
+        this.index = index;
+        this.similarity = similarity;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultSelfJoin.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultSelfJoin.java
new file mode 100644
index 0000000..9e2658c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/ResultSelfJoin.java
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin;
+
+public class ResultSelfJoin {
+    public int indexX, indexY;
+    public float similarity;
+
+    public ResultSelfJoin(int indexX, int indexY, float similarity) {
+        this.indexX = indexX;
+        this.indexY = indexY;
+        this.similarity = similarity;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedList.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedList.java
new file mode 100644
index 0000000..fe0553c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedList.java
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+
+public interface InvertedList extends Iterable<int[]> {
+    public void add(int[] element);
+
+    public void setMinLength(int length);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthFixed.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthFixed.java
new file mode 100644
index 0000000..ed8db19
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthFixed.java
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.Iterator;
+
+public class InvertedListLengthFixed implements InvertedList {
+    public class ListIterator implements Iterator<int[]> {
+
+        int ix;
+
+        public ListIterator(int ix) {
+            this.ix = ix;
+        }
+
+        public boolean hasNext() {
+            return ix < sz;
+        }
+
+        public int[] next() {
+            return list[ix++];
+        }
+
+        public void remove() {
+            throw new UnsupportedOperationException();
+        }
+    }
+
+    private final int[][] list;
+
+    private int sz, ix;
+
+    public InvertedListLengthFixed(int size) {
+        list = new int[size * 3][3];
+        sz = 0;
+        ix = 0;
+    }
+
+    public void add(int[] element) {
+        list[sz++] = element;
+    }
+
+    public int getIndex() {
+        return ix;
+    }
+
+    public int getSize() {
+        return sz;
+    }
+
+    public Iterator<int[]> iterator() {
+        // return Arrays.asList(list).iterator();
+        return new ListIterator(ix);
+    }
+
+    public void setMinLength(int minLength) {
+        while (ix < sz && list[ix][2] < minLength) {
+            ix++;
+        }
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthList.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthList.java
new file mode 100644
index 0000000..6322040
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListLengthList.java
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+public class InvertedListLengthList implements InvertedList {
+    /**
+     * @author rares
+     *         assumes that ListLength(s) are not empty inside the
+     *         InvertedListLength
+     */
+    private class ListIterator implements Iterator<int[]> {
+
+        private final Iterator<ListLength> iteratorLength;
+        private Iterator<int[]> iteratorList;
+
+        public ListIterator() {
+            iteratorLength = list.iterator();
+            iteratorList = null;
+        }
+
+        public boolean hasNext() {
+            return (iteratorList != null && iteratorList.hasNext()) || iteratorLength.hasNext();
+        }
+
+        public int[] next() {
+            if (iteratorList != null && iteratorList.hasNext()) {
+                return iteratorList.next();
+            }
+            iteratorList = iteratorLength.next().list.iterator();
+            return iteratorList.next();
+        }
+
+        public void remove() {
+            throw new UnsupportedOperationException();
+        }
+    }
+
+    private class ListLength {
+        public int length;
+        public ArrayList<int[]> list = new ArrayList<int[]>();
+
+        @Override
+        public String toString() {
+            StringBuffer l = new StringBuffer("(");
+            for (int[] i : list) {
+                l.append(Arrays.toString(i));
+                l.append(",");
+            }
+            l.append(")");
+            return "(length:" + length + "," + l + ")";
+        }
+    }
+
+    private LinkedList<ListLength> list;
+
+    public InvertedListLengthList() {
+        list = new LinkedList<ListLength>();
+    }
+
+    public void add(int[] element) {
+        if (!list.isEmpty() && list.getLast().length == element[2]) {
+            list.getLast().list.add(element);
+        } else {
+            ListLength listLength = new ListLength();
+            listLength.length = element[2];
+            listLength.list.add(element);
+            list.add(listLength);
+        }
+    }
+
+    public Iterator<int[]> iterator() {
+        return new ListIterator();
+    }
+
+    public void setMinLength(int minLength) {
+        while (!list.isEmpty() && list.getFirst().length < minLength) {
+            list.removeFirst();
+        }
+    }
+
+    @Override
+    public String toString() {
+        return list.toString();
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListPlain.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListPlain.java
new file mode 100644
index 0000000..b8c37c5
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListPlain.java
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+public class InvertedListPlain {
+    private List<int[]> list;
+
+    public InvertedListPlain() {
+        list = new ArrayList<int[]>();
+    }
+
+    public boolean add(int[] element) {
+        list.add(element);
+        return true;
+    }
+
+    public Iterator<int[]> iterator() {
+        return list.iterator();
+    }
+
+    public void setMinLength(int minLength) {
+    }
+
+    @Override
+    public String toString() {
+        return list.toString();
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedLists.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedLists.java
new file mode 100644
index 0000000..7696a38
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedLists.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+public interface InvertedLists {
+    public InvertedList get(int token);
+
+    public void index(int token, int[] element);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthFixed.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthFixed.java
new file mode 100644
index 0000000..db21980
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthFixed.java
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.Map;
+
+public class InvertedListsLengthFixed implements InvertedLists {
+    public final InvertedListLengthFixed[] invertedLists;
+    public final Map<Integer, Integer> invertedListsSize;
+
+    public InvertedListsLengthFixed(int noTokens, Map<Integer, Integer> invertedListsSize) {
+        invertedLists = new InvertedListLengthFixed[noTokens];
+        this.invertedListsSize = invertedListsSize;
+    }
+
+    public InvertedList get(int token) {
+        return invertedLists[token];
+    }
+
+    public void index(int token, int[] element) {
+        if (invertedLists[token] == null) {
+            // invertedLists[token] = new InvertedListLengthList();
+            invertedLists[token] = new InvertedListLengthFixed(invertedListsSize.get(token));
+        }
+        invertedLists[token].add(element);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthList.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthList.java
new file mode 100644
index 0000000..21f24b7
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/invertedlist/InvertedListsLengthList.java
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.invertedlist;
+
+import java.util.HashMap;
+
+public class InvertedListsLengthList implements InvertedLists {
+    public final HashMap<Integer, InvertedListLengthList> invertedLists;
+
+    public InvertedListsLengthList() {
+        invertedLists = new HashMap<Integer, InvertedListLengthList>();
+    }
+
+    public InvertedListLengthList get(int token) {
+        return invertedLists.get(token);
+    }
+
+    public void index(int token, int[] element) {
+        InvertedListLengthList list = invertedLists.get(token);
+        if (list == null) {
+            list = new InvertedListLengthList();
+            invertedLists.put(token, list);
+        }
+        list.add(element);
+    }
+
+    public void prune(int minLength) {
+        for (InvertedListLengthList l : invertedLists.values()) {
+            l.setMinLength(minLength);
+        }
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroup.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroup.java
new file mode 100644
index 0000000..9341a60
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroup.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public abstract class RecordGroup {
+    protected final int noGroups;
+    protected final SimilarityFilters fuzzyFilters;
+
+    public RecordGroup(int noGroups, SimilarityFilters fuzzyFilters) {
+        this.noGroups = noGroups;
+        this.fuzzyFilters = fuzzyFilters;
+    }
+
+    public abstract Iterable<Integer> getGroups(Integer token, Integer length);
+
+    public abstract boolean isLengthOnly();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupFactory.java
new file mode 100644
index 0000000..9b60d4e
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupFactory.java
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupFactory {
+    public static RecordGroup getRecordGroup(String recordGroup, int noGroups, SimilarityFilters fuzzyFilters,
+            String lengthstatsPath) {
+        if (recordGroup.equals("LengthCount")) {
+            return new RecordGroupLengthCount(noGroups, fuzzyFilters, lengthstatsPath);
+        } else if (recordGroup.equals("LengthIdentity")) {
+            return new RecordGroupLengthIdentity(noGroups, fuzzyFilters);
+        } else if (recordGroup.equals("LengthRange")) {
+            return new RecordGroupLengthRange(noGroups, fuzzyFilters, lengthstatsPath);
+        } else if (recordGroup.equals("Single")) {
+            return new RecordGroupSingle(noGroups, fuzzyFilters);
+        } else if (recordGroup.equals("TokenIdentity")) {
+            return new RecordGroupTokenIdentity(noGroups, fuzzyFilters);
+        } else if (recordGroup.equals("TokenFrequency")) {
+            return new RecordGroupTokenFrequency(noGroups, fuzzyFilters);
+        } else if (recordGroup.equals("TokenFrequencyMirror")) {
+            return new RecordGroupTokenFrequencyMirror(noGroups, fuzzyFilters);
+        }
+        throw new RuntimeException("Unknown record group \"" + recordGroup + "\".");
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthCount.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthCount.java
new file mode 100644
index 0000000..785346a
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthCount.java
@@ -0,0 +1,111 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.io.DataInputStream;
+import java.io.EOFException;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupLengthCount extends RecordGroup {
+    private final int min;
+    private final int max;
+    private final int[] lengthGroups;
+
+    public RecordGroupLengthCount(int noGroups, SimilarityFilters fuzzyFilters, String lengthstatsPath) {
+        super(noGroups, fuzzyFilters);
+
+        int sum = 0;
+        int range = 0;
+
+        try {
+            DataInputStream in = new DataInputStream(new FileInputStream(lengthstatsPath.toString()));
+            min = in.readInt();
+            max = in.readInt();
+            range = max - min + 1;
+            lengthGroups = new int[range]; // stores freqencies initally
+            try {
+                while (true) {
+                    int length = in.readInt();
+                    int freq = in.readInt();
+
+                    int lowAbs = fuzzyFilters.getLengthLowerBound(length);
+                    int uppAbs = fuzzyFilters.getLengthUpperBound(length);
+
+                    int low = Math.max(lowAbs - min, 0);
+                    int upp = Math.min(uppAbs - min, max - min);
+
+                    for (int l = low; l <= upp; ++l) {
+                        lengthGroups[l] += freq;
+                        sum += freq;
+                    }
+                }
+            } catch (EOFException e) {
+            }
+        } catch (IOException ioe) {
+            throw new RuntimeException(ioe);
+        }
+
+        int countGroup = sum / noGroups;
+        int count = 0;
+        int group = 0;
+        for (int i = 0; i < range; ++i) {
+            count += lengthGroups[i];
+            lengthGroups[i] = group;
+            if (count >= countGroup && group < noGroups - 1) {
+                count = 0;
+                group++;
+            }
+        }
+    }
+
+    @Override
+    public Iterable<Integer> getGroups(Integer token, Integer length) {
+        int lowAbs = fuzzyFilters.getLengthLowerBound(length);
+        int uppAbs = fuzzyFilters.getLengthUpperBound(length);
+
+        int low = Math.max(lowAbs - min, 0);
+        int upp = Math.min(uppAbs - min, max - min);
+
+        ArrayList<Integer> groups = new ArrayList<Integer>(upp - low + 1);
+        int prevGroup = -1;
+        for (int l = low; l <= upp; ++l) {
+            int group = lengthGroups[l];
+            if (group != prevGroup) {
+                groups.add(group);
+                prevGroup = group;
+            }
+        }
+
+        // System.out.println(noGroups + ":" + length + " [" + low + "," + upp
+        // + "] " + groups);
+
+        return groups;
+    }
+
+    @Override
+    public boolean isLengthOnly() {
+        return true;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthIdentity.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthIdentity.java
new file mode 100644
index 0000000..558eada
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthIdentity.java
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupLengthIdentity extends RecordGroup {
+
+    private final Collection<Integer> groups;
+
+    public RecordGroupLengthIdentity(int noGroups, SimilarityFilters fuzzyFilters) {
+        super(noGroups, fuzzyFilters);
+        groups = new LinkedList<Integer>();
+    }
+
+    @Override
+    public Iterable<Integer> getGroups(Integer token, Integer length) {
+        groups.clear();
+        groups.add(length);
+        return groups;
+    }
+
+    @Override
+    public boolean isLengthOnly() {
+        return true;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthRange.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthRange.java
new file mode 100644
index 0000000..2bdd02d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupLengthRange.java
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupLengthRange extends RecordGroup {
+    private final int min;
+    private final int max;
+    private final int groupSize;
+
+    public RecordGroupLengthRange(int noGroups, SimilarityFilters fuzzyFilters, String lengthstatsPath) {
+        super(noGroups, fuzzyFilters);
+        try {
+            DataInputStream in = new DataInputStream(new FileInputStream(lengthstatsPath.toString()));
+            min = in.readInt();
+            max = in.readInt();
+            groupSize = (int) Math.ceil((max - min + 1f) / noGroups);
+        } catch (IOException ioe) {
+            throw new RuntimeException(ioe);
+        }
+    }
+
+    @Override
+    public Iterable<Integer> getGroups(Integer token, Integer length) {
+        int lowAbs = fuzzyFilters.getLengthLowerBound(length);
+        int uppAbs = fuzzyFilters.getLengthUpperBound(length);
+
+        int low = Math.max(lowAbs - min, 0);
+        int upp = Math.min(uppAbs - min, max - min);
+
+        ArrayList<Integer> groups = new ArrayList<Integer>(upp - low + 1);
+        int prevGroup = -1;
+        for (int l = low; l <= upp; ++l) {
+            int group = l / groupSize;
+            if (group != prevGroup) {
+                groups.add(group);
+                prevGroup = group;
+            }
+        }
+
+        // System.out.println(length + " [" + lowAbs + "," + uppAbs + "] "
+        // + groups);
+
+        return groups;
+    }
+
+    @Override
+    public boolean isLengthOnly() {
+        return true;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupSingle.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupSingle.java
new file mode 100644
index 0000000..0f9480f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupSingle.java
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupSingle extends RecordGroup {
+
+    private final Collection<Integer> groups = new LinkedList<Integer>();
+
+    public RecordGroupSingle(int noGroups, SimilarityFilters fuzzyFilters) {
+        super(noGroups, fuzzyFilters);
+        groups.add(0);
+    }
+
+    @Override
+    public Iterable<Integer> getGroups(Integer token, Integer length) {
+        return groups;
+    }
+
+    @Override
+    public boolean isLengthOnly() {
+        return true;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequency.java
new file mode 100644
index 0000000..f0cd032
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequency.java
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupTokenFrequency extends RecordGroup {
+
+    private final Collection<Integer> groups = new LinkedList<Integer>();
+
+    public RecordGroupTokenFrequency(int noGroups, SimilarityFilters fuzzyFilters) {
+        super(noGroups, fuzzyFilters);
+    }
+
+    @Override
+    public Iterable<Integer> getGroups(Integer token, Integer length) {
+        groups.clear();
+        groups.add(token % noGroups);
+        return groups;
+    }
+
+    @Override
+    public boolean isLengthOnly() {
+        return false;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequencyMirror.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequencyMirror.java
new file mode 100644
index 0000000..36085a2
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenFrequencyMirror.java
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupTokenFrequencyMirror extends RecordGroup {
+
+    private final Collection<Integer> groups = new LinkedList<Integer>();
+
+    public RecordGroupTokenFrequencyMirror(int noGroups, SimilarityFilters fuzzyFilters) {
+        super(noGroups, fuzzyFilters);
+    }
+
+    @Override
+    public Iterable<Integer> getGroups(Integer token, Integer length) {
+        int noGroupsDouble = noGroups << 1;
+        int group = token % noGroupsDouble;
+        if (group >= noGroups) {
+            // mirror
+            group = -(group - noGroupsDouble) - 1;
+        }
+        groups.clear();
+        groups.add(group);
+        return groups;
+    }
+
+    @Override
+    public boolean isLengthOnly() {
+        return false;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenIdentity.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenIdentity.java
new file mode 100644
index 0000000..691fb10
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/recordgroup/RecordGroupTokenIdentity.java
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.recordgroup;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityFilters;
+
+public class RecordGroupTokenIdentity extends RecordGroup {
+
+    private final Collection<Integer> groups;
+
+    public RecordGroupTokenIdentity(int noGroups, SimilarityFilters fuzzyFilters) {
+        super(noGroups, fuzzyFilters);
+        groups = new LinkedList<Integer>();
+    }
+
+    @Override
+    public Iterable<Integer> getGroups(Integer token, Integer length) {
+        groups.clear();
+        groups.add(token);
+        return groups;
+    }
+
+    @Override
+    public boolean isLengthOnly() {
+        return false;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
new file mode 100644
index 0000000..85d1785
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public interface IGenericSimilarityMetric {
+    // returns similarity
+    public float getSimilarity(IListIterator firstList, IListIterator secondList);
+
+    // returns -1 if does not satisfy threshold
+    // else returns similarity
+    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IListIterator.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IListIterator.java
new file mode 100644
index 0000000..647c35f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/IListIterator.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public interface IListIterator {
+    public int compare(IListIterator cmpIter);
+
+    public byte[] getData();
+
+    public int getPos();
+
+    public boolean hasNext();
+
+    public void next();
+
+    public void reset();
+
+    public int size();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/PartialIntersect.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/PartialIntersect.java
new file mode 100644
index 0000000..15c339e
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/PartialIntersect.java
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public class PartialIntersect {
+    public int intersectSize;
+    public int posXStart;
+    public int posXStop;
+    public int posYStart;
+    public int posYStop;
+
+    private boolean startSet = false;
+
+    public boolean isSet() {
+        return startSet;
+    }
+
+    public void reset() {
+        startSet = false;
+    }
+
+    public void set() {
+        startSet = true;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFilters.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFilters.java
new file mode 100644
index 0000000..48b22ca
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFilters.java
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import java.io.Serializable;
+
+public interface SimilarityFilters extends Serializable {
+    public int getLengthLowerBound(int length);
+
+    public int getLengthUpperBound(int length);
+
+    public int getPrefixLength(int length);
+
+    public boolean passLengthFilter(int lengthX, int lengthY);
+
+    public boolean passPositionFilter(int noGramsCommon, int positionX, int lengthX, int positionY, int lengthY);
+
+    public float passSimilarityFilter(final int[] tokensX, int startX, int lengthX, final int prefixLengthX,
+            final int[] tokensY, int startY, int lengthY, final int prefixLengthY, final int intersectionSizePrefix);
+
+    public float passSimilarityFilter(final int[] tokensX, final int prefixLengthX, final int[] tokensY,
+            final int prefixLengthY, final int intersectionSizePrefix);
+
+    public boolean passSuffixFilter(int[] tokensX, int startX, int lengthX, int positionX, int[] tokensY, int startY,
+            int lengthY, int positionY);
+
+    public boolean passSuffixFilter(int[] tokensX, int positionX, int[] tokensY, int positionY);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersFactory.java
new file mode 100644
index 0000000..07c68a3
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersFactory.java
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public class SimilarityFiltersFactory {
+    public static SimilarityFilters getSimilarityFilters(String similarityName, float similarityThreshold) {
+        if ("jaccard".equalsIgnoreCase(similarityName)) {
+            return new SimilarityFiltersJaccard(similarityThreshold);
+        }
+        throw new RuntimeException("Unknown fuzzy filters \"" + similarityName + "\".");
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersJaccard.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersJaccard.java
new file mode 100644
index 0000000..db9fefa
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityFiltersJaccard.java
@@ -0,0 +1,297 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import java.util.Arrays;
+
+public class SimilarityFiltersJaccard implements SimilarityFilters {
+    class Partition {
+        public int startL;
+        public int lengthL;
+        public int startR;
+        public int lengthR;
+        public int hamming;
+
+        public Partition() {
+        }
+
+        public Partition(int startL, int lengthL, int startR, int lengthR, int hamming) {
+            this.startL = startL;
+            this.lengthL = lengthL;
+            this.startR = startR;
+            this.lengthR = lengthR;
+            this.hamming = hamming;
+        }
+    }
+
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    private static final int MAX_DEPTH = 2;
+
+    public static int getLengthLowerBound(int length, float simThr) {
+        return (int) Math.ceil(simThr * length);
+    }
+
+    public static boolean passLengthFilter(int lengthX, int lengthY, float simThr) {
+        return getLengthLowerBound(lengthX, simThr) <= lengthY && lengthY <= 1 / simThr * lengthX;
+    }
+
+    protected float simThr;
+
+    protected float simThr100;
+
+    public SimilarityFiltersJaccard(float similarityThreshold) {
+        reset(similarityThreshold);
+    }
+
+    public int getIndexPrefixLength(int length) {
+        return length - (int) Math.ceil(2 * simThr100 / (100 + simThr100) * length) + 1;
+    }
+
+    public int getIntersectLowerBound(int lengthX, int lengthY) {
+        return (int) Math.ceil(simThr100 * (lengthX + lengthY) / (100 + simThr100));
+    }
+
+    public int getIntersectUpperBound(int noGramsCommon, int positionX, int positionY, int lengthX, int lengthY) {
+        return noGramsCommon + Math.min(lengthX - positionX - 1, lengthY - positionY - 1);
+    }
+
+    public int getLengthLowerBound(int length) {
+        return getLengthLowerBound(length, simThr);
+    }
+
+    public int getLengthUpperBound(int length) {
+        return (int) Math.floor(1 / simThr * length);
+    }
+
+    private Partition getPartition(int[] tokens, int start, int length, int w, int posL, int posR) {
+        int p;
+        if (tokens[posL] > w) {
+            p = posL;
+        } else if (tokens[posR] < w) {
+            p = posR;
+        } else {
+            p = Arrays.binarySearch(tokens, start, start + length, w);
+        }
+
+        if (p < 0) {
+            p = -p - 1;
+        }
+
+        if (p >= start && p < start + length && tokens[p] == w) {
+            return new Partition(start, p - start, p + 1, start + length - p - 1, 0);
+        }
+        return new Partition(start, p - start, p, start + length - p, 1);
+    }
+
+    public int getPrefixLength(int length) {
+        if (length == 0) {
+            return 0;
+        }
+        return length - (int) Math.ceil(simThr * length) + 1;
+    }
+
+    public float getSimilarityThreshold() {
+        return simThr;
+    }
+
+    private int getSuffixFilter(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY,
+            int hammingMax, int depth) {
+        final int lengthDiff = Math.abs(lengthX - lengthY);
+
+        if (depth > MAX_DEPTH || lengthX == 0 || lengthY == 0) {
+            return lengthDiff;
+        }
+
+        final int mid = startY + lengthY / 2 + lengthY % 2 - 1;
+        final int offset = (hammingMax - lengthDiff) / 2;
+
+        int offsetL;
+        int offsetR;
+        if (lengthX < lengthY) {
+            offsetL = 1;
+            offsetR = 0;
+        } else {
+            offsetL = 0;
+            offsetR = 1;
+        }
+        Partition partitionY = new Partition(startY, mid - startY, mid + 1, startY + lengthY - mid - 1, 0);
+
+        // Partition partitionX = getPartition(tokensX, startX, lengthX,
+        // tokensY[mid], Math.max(Math.min(mid + startX - startY, startX
+        // + lengthX - 1)
+        // - offset - Math.abs(lengthX - lengthY) * offsetL,
+        // startX), Math.min(Math.max(mid + startX - startY,
+        // startX)
+        // + offset + Math.abs(lengthX - lengthY) * offsetR,
+        // startX + lengthX - 1));
+
+        Partition partitionX = getPartition(tokensX, startX, lengthX, tokensY[mid],
+                Math.max(mid + startX - startY - offset - lengthDiff * offsetL, startX),
+                Math.min(mid + startX - startY + offset + lengthDiff * offsetR, startX + lengthX - 1));
+
+        int hammingPart = partitionX.hamming;
+
+        int hamming = Math.abs(partitionX.lengthL - partitionY.lengthL)
+                + Math.abs(partitionX.lengthR - partitionY.lengthR) + hammingPart;
+
+        if (hamming <= hammingMax) {
+            int hammingL = getSuffixFilter(tokensX, partitionX.startL, partitionX.lengthL, tokensY, partitionY.startL,
+                    partitionY.lengthL, hammingMax - Math.abs(partitionX.lengthR - partitionY.lengthR) - hammingPart,
+                    depth + 1);
+            hamming = hammingL + Math.abs(partitionX.lengthR - partitionY.lengthR) + hammingPart;
+
+            if (hamming <= hammingMax) {
+                int hammingR = getSuffixFilter(tokensX, partitionX.startR, partitionX.lengthR, tokensY,
+                        partitionY.startR, partitionY.lengthR, hammingMax - hammingL - hammingPart, depth + 1);
+                hamming = hammingL + hammingR + hammingPart;
+            }
+        }
+        return hamming;
+    }
+
+    public boolean passLengthFilter(int lengthX, int lengthY) {
+        return passLengthFilter(lengthX, lengthY, simThr);
+    }
+
+    /**
+     * @param noGramsCommon
+     *            number of grams in common
+     * @param positionX
+     *            position of the last gram in common on X
+     * @param positionY
+     *            position of the last gram in common on X
+     * @param lengthX
+     * @param lengthY
+     * @return
+     */
+    public boolean passPositionFilter(int noGramsCommon, int positionX, int lengthX, int positionY, int lengthY) {
+        return getIntersectUpperBound(noGramsCommon, positionX, positionY, lengthX, lengthY) >= getIntersectLowerBound(
+                lengthX, lengthY);
+    }
+
+    public float passSimilarityFilter(final int[] tokensX, int startX, int lengthX, final int prefixLengthX,
+            final int[] tokensY, int startY, int lengthY, final int prefixLengthY, final int intersectionSizePrefix) {
+        final int length = lengthX;
+        final int token = tokensX[startX + Math.min(prefixLengthX, lengthX) - 1];
+        final int lengthProbe = lengthY;
+        final int tokenProbe = tokensY[startY + prefixLengthY - 1];
+
+        final int intersectSizeLowerBound = getIntersectLowerBound(length, lengthProbe);
+        int intersectSize = 0;
+
+        if (token < tokenProbe) {
+            if (intersectionSizePrefix + length - prefixLengthX >= intersectSizeLowerBound) {
+                intersectSize = intersectionSizePrefix
+                        + SimilarityMetric.getIntersectSize(tokensX, startX + prefixLengthX, lengthX - prefixLengthX,
+                                tokensY, startY + intersectionSizePrefix, lengthY - intersectionSizePrefix);
+            }
+        } else {
+            if (intersectionSizePrefix + lengthProbe - prefixLengthY >= intersectSizeLowerBound) {
+                intersectSize = intersectionSizePrefix
+                        + SimilarityMetric.getIntersectSize(tokensX, startX + intersectionSizePrefix, lengthX
+                                - intersectionSizePrefix, tokensY, startY + prefixLengthY, lengthY - prefixLengthY);
+            }
+        }
+
+        if (intersectSize >= intersectSizeLowerBound) {
+            return ((float) intersectSize) / (length + lengthProbe - intersectSize);
+        }
+        return 0;
+    }
+
+    /**
+     * @param tokensX
+     * @param prefixLengthX
+     * @param tokensY
+     * @param prefixLengthY
+     * @param intersectionSizePrefix
+     * @return similarity if it is above or equal to the similarity threshold, 0
+     *         otherwise
+     */
+    public float passSimilarityFilter(final int[] tokensX, final int prefixLengthX, final int[] tokensY,
+            final int prefixLengthY, final int intersectionSizePrefix) {
+        // final int length = tokensX.length;
+        // final int token = tokensX[Math.min(prefixLengthX, tokensX.length) -
+        // 1];
+        // final int lengthProbe = tokensY.length;
+        // final int tokenProbe = tokensY[prefixLengthY - 1];
+        //
+        // final int intersectSizeLowerBound = getIntersectLowerBound(length,
+        // lengthProbe);
+        // int intersectSize = 0;
+        //
+        // if (token < tokenProbe) {
+        // if (intersectionSizePrefix + length - prefixLengthX >=
+        // intersectSizeLowerBound) {
+        // intersectSize = intersectionSizePrefix
+        // + SimilarityMetric.getIntersectSize(tokensX,
+        // prefixLengthX, tokensY, intersectionSizePrefix);
+        // }
+        // } else {
+        // if (intersectionSizePrefix + lengthProbe - prefixLengthY >=
+        // intersectSizeLowerBound) {
+        // intersectSize = intersectionSizePrefix
+        // + SimilarityMetric.getIntersectSize(tokensX,
+        // intersectionSizePrefix, tokensY, prefixLengthY);
+        // }
+        // }
+        //
+        // if (intersectSize >= intersectSizeLowerBound) {
+        // return ((float) intersectSize)
+        // / (length + lengthProbe - intersectSize);
+        // }
+        // return 0;
+        return passSimilarityFilter(tokensX, 0, tokensX.length, prefixLengthX, tokensY, 0, tokensY.length,
+                prefixLengthY, intersectionSizePrefix);
+    }
+
+    public boolean passSuffixFilter(int[] tokensX, int tokensStartX, int tokensLengthX, int positionX, int[] tokensY,
+            int tokensStartY, int tokensLengthY, int positionY) {
+        int hammingMax = tokensLengthX + tokensLengthY - 2
+                * (int) Math.ceil(simThr100 / (100 + simThr100) * (tokensLengthX + tokensLengthY))
+                - (positionX + 1 + positionY + 1 - 2);
+        int hamming = getSuffixFilter(tokensX, tokensStartX + positionX + 1, tokensLengthX - positionX - 1, tokensY,
+                tokensStartY + positionY + 1, tokensLengthY - positionY - 1, hammingMax, 1);
+        return hamming <= hammingMax;
+    }
+
+    public boolean passSuffixFilter(int[] tokensX, int positionX, int[] tokensY, int positionY) {
+        // int hammingMax = tokensX.length
+        // + tokensY.length
+        // - 2
+        // * (int) Math.ceil(simThr100 / (100 + simThr100)
+        // * (tokensX.length + tokensY.length))
+        // - (positionX + 1 + positionY + 1 - 2);
+        // int hamming = getSuffixFilter(tokensX, positionX + 1, tokensX.length
+        // - positionX - 1, tokensY, positionY + 1, tokensY.length
+        // - positionY - 1, hammingMax, 1);
+        // return hamming <= hammingMax;
+        return passSuffixFilter(tokensX, 0, tokensX.length, positionX, tokensY, 0, tokensY.length, positionY);
+    }
+
+    public void reset(float similarityThreshold) {
+        simThr = similarityThreshold;
+        simThr100 = simThr * 100;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetric.java
new file mode 100644
index 0000000..415e785
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -0,0 +1,184 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Tokenizer;
+
+public abstract class SimilarityMetric {
+
+    public static int getIntersectSize(IListIterator tokensX, IListIterator tokensY) {
+        int intersectSize = 0;
+        while (tokensX.hasNext() && tokensY.hasNext()) {
+            int cmp = tokensX.compare(tokensY);
+            if (cmp > 0) {
+                tokensY.next();
+            } else if (cmp < 0) {
+                tokensX.next();
+            } else {
+                intersectSize++;
+                tokensX.next();
+                tokensY.next();
+            }
+        }
+        return intersectSize;
+    }
+
+    public static int getIntersectSize(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY) {
+        int posX = 0;
+        int posY = 0;
+        int intersectSize = 0;
+
+        while (posX < lengthX && posY < lengthY) {
+            int tokenX = tokensX[startX + posX];
+            int tokenY = tokensY[startY + posY];
+            if (tokenX > tokenY) {
+                posY++;
+            } else if (tokenX < tokenY) {
+                posX++;
+            } else {
+                intersectSize++;
+                posX++;
+                posY++;
+            }
+        }
+
+        return intersectSize;
+    }
+
+    public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY) {
+        // int intersectSize = 0;
+        //
+        // while (startX < tokensX.length && startY < tokensY.length) {
+        // int tokenX = tokensX[startX];
+        // int tokenY = tokensY[startY];
+        // if (tokenX > tokenY) {
+        // startY++;
+        // } else if (tokenX < tokenY) {
+        // startX++;
+        // } else {
+        // intersectSize++;
+        // startX++;
+        // startY++;
+        // }
+        // }
+        //
+        // return intersectSize;
+        return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
+    }
+
+    public static int getIntersectSize(int[] tokensX, int[] tokensY) {
+        return getIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length);
+    }
+
+    public static PartialIntersect getPartialIntersectSize(int[] tokensX, int startX, int lengthX, int[] tokensY,
+            int startY, int lengthY, int tokenStop) {
+        PartialIntersect parInter = new PartialIntersect();
+        getPartialIntersectSize(tokensX, startX, lengthX, tokensY, startY, lengthY, tokenStop, parInter);
+        return parInter;
+    }
+
+    public static void getPartialIntersectSize(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY,
+            int lengthY, int tokenStop, PartialIntersect parInter) {
+        int posX = 0;
+        int posY = 0;
+        int intersectSize = 0;
+
+        parInter.reset();
+        while (posX < lengthX && posY < lengthY) {
+            int tokenX = tokensX[startX + posX];
+            int tokenY = tokensY[startY + posY];
+            if (tokenX > tokenY) {
+                posY++;
+            } else if (tokenX < tokenY) {
+                posX++;
+            } else {
+                intersectSize++;
+                if (!parInter.isSet()) {
+                    parInter.posXStart = posX;
+                    parInter.posYStart = posY;
+                    parInter.set();
+                }
+                if (tokenX == tokenStop) {
+                    parInter.posXStop = posX;
+                    parInter.posYStop = posY;
+                    parInter.intersectSize = intersectSize;
+                }
+                posX++;
+                posY++;
+            }
+        }
+    }
+
+    public static PartialIntersect getPartialIntersectSize(int[] tokensX, int[] tokensY, int tokenStop) {
+        return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length, tokenStop);
+    }
+
+    // @SuppressWarnings("unchecked")
+    // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
+    // int intersectSize = 0;
+    //
+    // Iterator<Tuple> iteratorX = tokensX.iterator();
+    // Iterator<Tuple> iteratorY = tokensY.iterator();
+    //
+    // Tuple nextX = null;
+    // Tuple nextY = null;
+    //
+    // while ((nextX != null || iteratorX.hasNext())
+    // && (nextY != null || iteratorY.hasNext())) {
+    // if (nextX == null) {
+    // nextX = iteratorX.next();
+    // }
+    // if (nextY == null) {
+    // nextY = iteratorY.next();
+    // }
+    //
+    // int cmp = nextX.compareTo(nextY);
+    // if (cmp > 0) {
+    // nextY = null;
+    // } else if (cmp < 0) {
+    // nextX = null;
+    // } else {
+    // intersectSize++;
+    // nextX = null;
+    // nextY = null;
+    // }
+    // }
+    //
+    // return intersectSize;
+    // }
+
+    // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
+
+    // public abstract float getSimilarity(DataBag tokensX, int lengthX,
+    // DataBag tokensY, int lengthY);
+
+    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) {
+        int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
+        int totalSize = tokensX.size() + tokensY.size();
+
+        return (float) intersectionSize / (totalSize - intersectionSize);
+    }
+
+    public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
+
+    public abstract float getSimilarity(int[] tokensX, int[] tokensY);
+
+    public abstract float getSimilarity(String stringX, String stringY, Tokenizer tokenizer);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
new file mode 100644
index 0000000..b99d6f7
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -0,0 +1,216 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import java.util.Arrays;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.StringUtils;
+
+public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
+
+    private final int utf8SizeIndicatorSize = 2;
+
+    // dp implementation only needs 2 rows
+    private final int rows = 2;
+    private int cols;
+    private int[][] matrix;
+
+    // for letter count filtering
+    private final int[] fsLcCount = new int[128];
+    private final int[] ssLcCount = new int[128];
+
+    public SimilarityMetricEditDistance() {
+        cols = 100; // arbitrary default value
+        matrix = new int[rows][cols];
+    }
+
+    @Override
+    public float getSimilarity(IListIterator firstList, IListIterator secondList) {
+        int flLen = firstList.size();
+        int slLen = secondList.size();
+
+        // reuse existing matrix if possible
+        if (slLen >= cols) {
+            cols = slLen + 1;
+            matrix = new int[rows][cols];
+        }
+
+        // init matrix
+        for (int i = 0; i <= slLen; i++) {
+            matrix[0][i] = i;
+        }
+
+        int currRow = 1;
+        int prevRow = 0;
+
+        // expand dynamic programming matrix row by row
+        for (int i = 1; i <= flLen; i++) {
+            matrix[currRow][0] = i;
+
+            secondList.reset();
+            for (int j = 1; j <= slLen; j++) {
+
+                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+                        matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0 : 1));
+
+                secondList.next();
+            }
+
+            firstList.next();
+
+            int tmp = currRow;
+            currRow = prevRow;
+            prevRow = tmp;
+        }
+
+        return matrix[prevRow][slLen];
+    }
+
+    @Override
+    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh) {
+
+        int edThresh = (int) simThresh;
+
+        int flLen = firstList.size();
+        int slLen = secondList.size();
+
+        // length filter
+        if (Math.abs(flLen - slLen) > edThresh) {
+            return -1;
+        }
+
+        float ed = getSimilarity(firstList, secondList);
+        if (ed > edThresh) {
+            return -1;
+        } else {
+            return ed;
+        }
+    }
+
+    // faster implementation for common case of string edit distance
+    public int UTF8StringEditDistance(byte[] bytes, int fsStart, int ssStart) {
+
+        int fsLen = StringUtils.getStrLen(bytes, fsStart);
+        int ssLen = StringUtils.getStrLen(bytes, ssStart);
+
+        // reuse existing matrix if possible
+        if (ssLen >= cols) {
+            cols = ssLen + 1;
+            matrix = new int[rows][cols];
+        }
+
+        int fsDataStart = fsStart + utf8SizeIndicatorSize;
+        int ssDataStart = ssStart + utf8SizeIndicatorSize;
+
+        // init matrix
+        for (int i = 0; i <= ssLen; i++) {
+            matrix[0][i] = i;
+        }
+
+        int currRow = 1;
+        int prevRow = 0;
+
+        // expand dynamic programming matrix row by row
+        int fsPos = fsDataStart;
+        for (int i = 1; i <= fsLen; i++) {
+            matrix[currRow][0] = i;
+            char fsChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, fsPos));
+
+            int ssPos = ssDataStart;
+            for (int j = 1; j <= ssLen; j++) {
+                char ssChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, ssPos));
+
+                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+                        matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
+
+                ssPos += StringUtils.charSize(bytes, ssPos);
+            }
+
+            fsPos += StringUtils.charSize(bytes, fsPos);
+
+            int tmp = currRow;
+            currRow = prevRow;
+            prevRow = tmp;
+        }
+
+        return matrix[prevRow][ssLen];
+    }
+
+    public int UTF8StringEditDistance(byte[] bytes, int fsStart, int ssStart, int edThresh) {
+
+        int fsUtfLen = StringUtils.getUTFLen(bytes, fsStart);
+        int ssUtfLen = StringUtils.getUTFLen(bytes, ssStart);
+
+        // length filter
+        if (Math.abs(fsUtfLen - ssUtfLen) > edThresh) {
+            return -1;
+        }
+
+        // initialize letter count filtering
+        Arrays.fill(fsLcCount, 0);
+        Arrays.fill(ssLcCount, 0);
+
+        // compute letter counts for first string
+        int fsPos = fsStart + utf8SizeIndicatorSize;
+        int fsEnd = fsPos + fsUtfLen;
+        while (fsPos < fsEnd) {
+            char c = StringUtils.toLowerCase(StringUtils.charAt(bytes, fsPos));
+            if (c < 128) {
+                fsLcCount[c]++;
+            }
+            fsPos += StringUtils.charSize(bytes, fsPos);
+        }
+
+        // compute letter counts for second string
+        int ssPos = ssStart + utf8SizeIndicatorSize;
+        int ssEnd = ssPos + ssUtfLen;
+        while (ssPos < ssEnd) {
+            char c = StringUtils.toLowerCase(StringUtils.charAt(bytes, ssPos));
+            if (c < 128) {
+                ssLcCount[c]++;
+            }
+            ssPos += StringUtils.charSize(bytes, ssPos);
+        }
+
+        // apply filter
+        int gtSum = 0;
+        int ltSum = 0;
+        for (int i = 0; i < 128; i++) {
+            if (fsLcCount[i] > ssLcCount[i]) {
+                gtSum += fsLcCount[i] - ssLcCount[i];
+                if (gtSum > edThresh) {
+                    return -1;
+                }
+            } else {
+                ltSum += ssLcCount[i] - fsLcCount[i];
+                if (ltSum > edThresh) {
+                    return -1;
+                }
+            }
+        }
+
+        int ed = UTF8StringEditDistance(bytes, fsStart, ssStart);
+        if (ed > edThresh) {
+            return -1;
+        } else {
+            return ed;
+        }
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricFactory.java
new file mode 100644
index 0000000..a66328b
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricFactory.java
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+public class SimilarityMetricFactory {
+    public static SimilarityMetric getSimilarityMetric(String similarityMetric) {
+        if (similarityMetric.equals("Jaccard")) {
+            return new SimilarityMetricJaccard();
+        }
+        throw new RuntimeException("Unknown fuzzy metric \"" + similarityMetric + "\".");
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
new file mode 100644
index 0000000..b0c0638
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -0,0 +1,118 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.similarity;
+
+import java.util.Set;
+import java.util.TreeSet;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Tokenizer;
+
+public class SimilarityMetricJaccard extends SimilarityMetric implements IGenericSimilarityMetric {
+
+    public static float getSimilarity(int intersectSize, int lengthX, int lengthY) {
+        return ((float) intersectSize) / (lengthX + lengthY - intersectSize);
+    }
+
+    public static float getSimilarityBag(int[] tokensX, int[] tokensY) {
+        Set<Integer> setX = new TreeSet<Integer>();
+        for (int token : tokensX) {
+            setX.add(token);
+        }
+        Set<Integer> setY = new TreeSet<Integer>();
+        for (int token : tokensY) {
+            setY.add(token);
+        }
+        setX.retainAll(setY);
+        return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
+    }
+
+    // @Override
+    // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
+    // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
+    // (int) tokensY.size());
+    // }
+
+    // @Override
+    // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
+    // int lengthY) {
+    // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
+    // tokensY);
+    // int totalSize = lengthX + lengthY;
+    //
+    // return (float) intersectionSize / (totalSize - intersectionSize);
+    // }
+
+    @Override
+    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) {
+        int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
+        int totalSize = tokensX.size() + tokensY.size();
+
+        return (float) intersectionSize / (totalSize - intersectionSize);
+    }
+
+    @Override
+    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh) {
+
+        // apply length filter
+        int lengthLowerBound = (int) Math.ceil(simThresh * firstList.size());
+
+        boolean passesLengthFilter = (lengthLowerBound <= secondList.size())
+                && (secondList.size() <= 1.0f / simThresh * firstList.size());
+        if (!passesLengthFilter) {
+            return -1f;
+        }
+
+        float jacc = getSimilarity(firstList, secondList);
+        if (jacc < simThresh) {
+            return -1f;
+        } else {
+            return jacc;
+        }
+    }
+
+    @Override
+    public float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY) {
+        int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, startX, lengthX, tokensY, startY, lengthY);
+        int totalSize = lengthX + lengthY;
+
+        return (float) intersectionSize / (totalSize - intersectionSize);
+    }
+
+    @Override
+    public float getSimilarity(int[] tokensX, int[] tokensY) {
+        return getSimilarity(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length);
+    }
+
+    @Override
+    public float getSimilarity(String stringX, String stringY, Tokenizer tokenizer) {
+        Set<String> setX = new TreeSet<String>();
+        for (String token : tokenizer.tokenize(stringX)) {
+            setX.add(token);
+        }
+        Set<String> setY = new TreeSet<String>();
+        for (String token : tokenizer.tokenize(stringY)) {
+            setY.add(token);
+        }
+        int lengthX = setX.size();
+        int lengthY = setY.size();
+        setX.retainAll(setY);
+        return ((float) setX.size()) / (lengthX + lengthY - setX.size());
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..0e3c28f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8StringBinaryTokenizer.java
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import edu.uci.ics.asterix.fuzzyjoin.IntArray;
+
+public abstract class AbstractUTF8StringBinaryTokenizer implements IBinaryTokenizer {
+
+    protected byte[] data;
+    protected int start;
+    protected int length;
+    protected int tokenLength;
+    protected int index;
+    protected int utf8Length;
+
+    protected final IntArray tokensStart;
+    protected final IntArray tokensLength;
+    protected final IToken token;
+
+    protected final boolean ignoreTokenCount;
+    protected final boolean sourceHasTypeTag;
+
+    public AbstractUTF8StringBinaryTokenizer(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+            ITokenFactory tokenFactory) {
+        this.ignoreTokenCount = ignoreTokenCount;
+        this.sourceHasTypeTag = sourceHasTypeTag;
+        if (!ignoreTokenCount) {
+            tokensStart = new IntArray();
+            tokensLength = new IntArray();
+        } else {
+            tokensStart = null;
+            tokensLength = null;
+        }
+        token = tokenFactory.createToken();
+    }
+
+    @Override
+    public IToken getToken() {
+        return token;
+    }
+
+    @Override
+    public void reset(byte[] data, int start, int length) {
+        this.start = start;
+        index = this.start;
+        if (sourceHasTypeTag) {
+            index++; // skip type tag
+        }
+        utf8Length = StringUtils.getUTFLen(data, index);
+        index += 2; // skip utf8 length indicator
+        this.data = data;
+        this.length = length + start;
+
+        tokenLength = 0;
+        if (!ignoreTokenCount) {
+            tokensStart.reset();
+            tokensLength.reset();
+        }
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8Token.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8Token.java
new file mode 100644
index 0000000..d478f36
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8Token.java
@@ -0,0 +1,103 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public abstract class AbstractUTF8Token implements IToken {
+    public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+
+    protected int length;
+    protected int tokenLength;
+    protected int start;
+    protected int tokenCount;
+    protected byte[] data;
+    protected final byte tokenTypeTag;
+    protected final byte countTypeTag;
+
+    public AbstractUTF8Token() {
+        tokenTypeTag = -1;
+        countTypeTag = -1;
+    }
+
+    public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+        this.tokenTypeTag = tokenTypeTag;
+        this.countTypeTag = countTypeTag;
+    }
+
+    @Override
+    public byte[] getData() {
+        return data;
+    }
+
+    @Override
+    public int getLength() {
+        return length;
+    }
+
+    public int getLowerCaseUTF8Len(int size) {
+        int lowerCaseUTF8Len = 0;
+        int pos = start;
+        for (int i = 0; i < size; i++) {
+            char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+            lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+            pos += StringUtils.charSize(data, pos);
+        }
+        return lowerCaseUTF8Len;
+    }
+
+    @Override
+    public int getStart() {
+        return start;
+    }
+
+    @Override
+    public int getTokenLength() {
+        return tokenLength;
+    }
+
+    public void handleCountTypeTag(DataOutput dos) throws IOException {
+        if (countTypeTag > 0) {
+            dos.write(countTypeTag);
+        }
+    }
+
+    public void handleTokenTypeTag(DataOutput dos) throws IOException {
+        if (tokenTypeTag > 0) {
+            dos.write(tokenTypeTag);
+        }
+    }
+
+    @Override
+    public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+        this.data = data;
+        this.start = start;
+        this.length = length;
+        this.tokenLength = tokenLength;
+        this.tokenCount = tokenCount;
+    }
+
+    @Override
+    public void serializeTokenCount(DataOutput dos) throws IOException {
+        handleCountTypeTag(dos);
+        dos.writeInt(tokenCount);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8TokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8TokenFactory.java
new file mode 100644
index 0000000..5ade0dc
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/AbstractUTF8TokenFactory.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public abstract class AbstractUTF8TokenFactory implements ITokenFactory {
+    private static final long serialVersionUID = 1L;
+    protected final byte tokenTypeTag;
+    protected final byte countTypeTag;
+
+    public AbstractUTF8TokenFactory() {
+        tokenTypeTag = -1;
+        countTypeTag = -1;
+    }
+
+    public AbstractUTF8TokenFactory(byte tokenTypeTag, byte countTypeTag) {
+        this.tokenTypeTag = tokenTypeTag;
+        this.countTypeTag = countTypeTag;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..5a12c00
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizer.java
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class DelimitedUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
+
+    public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+            ITokenFactory tokenFactory) {
+        super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+    }
+
+    @Override
+    public boolean hasNext() {
+        // skip delimiters
+        while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+            index += StringUtils.charSize(data, index);
+        }
+        return index < length;
+    }
+
+    private boolean isSeparator(char c) {
+        return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+    }
+
+    @Override
+    public void next() {
+        tokenLength = 0;
+        int currentTokenStart = index;
+        while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+            index += StringUtils.charSize(data, index);
+            tokenLength++;
+        }
+        int tokenCount = 1;
+        if (tokenLength > 0 && !ignoreTokenCount) {
+            // search if we got the same token before
+            for (int i = 0; i < tokensStart.length(); ++i) {
+                if (tokenLength == tokensLength.get(i)) {
+                    int tokenStart = tokensStart.get(i);
+                    tokenCount++; // assume we found it
+                    int offset = 0;
+                    int currLength = 0;
+                    while (currLength < tokenLength) {
+                        // case insensitive comparison
+                        if (StringUtils.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != StringUtils
+                                .toLowerCase(StringUtils.charAt(data, tokenStart + offset))) {
+                            tokenCount--;
+                            break;
+                        }
+                        offset += StringUtils.charSize(data, currentTokenStart + offset);
+                        currLength++;
+                    }
+                }
+            }
+            // add the new token to the list of seen tokens
+            tokensStart.add(currentTokenStart);
+            tokensLength.add(tokenLength);
+        }
+
+        // set token
+        token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..cb002df
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
+
+    private static final long serialVersionUID = 1L;
+    private final boolean ignoreTokenCount;
+    private final boolean sourceHasTypeTag;
+    private final ITokenFactory tokenFactory;
+
+    public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+            ITokenFactory tokenFactory) {
+        this.ignoreTokenCount = ignoreTokenCount;
+        this.sourceHasTypeTag = sourceHasTypeTag;
+        this.tokenFactory = tokenFactory;
+    }
+
+    @Override
+    public IBinaryTokenizer createTokenizer() {
+        return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramToken.java
new file mode 100644
index 0000000..461638f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramToken.java
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class HashedUTF8NGramToken extends UTF8NGramToken {
+    public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
+
+    @Override
+    public void serializeToken(DataOutput dos) throws IOException {
+        handleTokenTypeTag(dos);
+
+        int hash = GOLDEN_RATIO_32;
+
+        // pre chars
+        for (int i = 0; i < numPreChars; i++) {
+            hash ^= PRECHAR;
+            hash *= GOLDEN_RATIO_32;
+        }
+
+        // regular chars
+        int numRegGrams = tokenLength - numPreChars - numPostChars;
+        int pos = start;
+        for (int i = 0; i < numRegGrams; i++) {
+            hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+            hash *= GOLDEN_RATIO_32;
+            pos += StringUtils.charSize(data, pos);
+        }
+
+        // post chars
+        for (int i = 0; i < numPostChars; i++) {
+            hash ^= POSTCHAR;
+            hash *= GOLDEN_RATIO_32;
+        }
+
+        // token count
+        hash += tokenCount;
+
+        dos.writeInt(hash);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
new file mode 100644
index 0000000..9b6f835
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+    private static final long serialVersionUID = 1L;
+
+    public HashedUTF8NGramTokenFactory() {
+        super();
+    }
+
+    public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
+
+    @Override
+    public IToken createToken() {
+        return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
new file mode 100644
index 0000000..c7d5f3a
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
@@ -0,0 +1,84 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class HashedUTF8WordToken extends UTF8WordToken {
+
+    private int hash = 0;
+
+    public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (o == null) {
+            return false;
+        }
+        if (!(o instanceof IToken)) {
+            return false;
+        }
+        IToken t = (IToken) o;
+        if (t.getTokenLength() != tokenLength) {
+            return false;
+        }
+        int offset = 0;
+        for (int i = 0; i < tokenLength; i++) {
+            if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils.charAt(data, start + offset)) {
+                return false;
+            }
+            offset += StringUtils.charSize(data, start + offset);
+        }
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        return hash;
+    }
+
+    @Override
+    public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+        super.reset(data, start, length, tokenLength, tokenCount);
+
+        // pre-compute hash value using JAQL-like string hashing
+        int pos = start;
+        hash = GOLDEN_RATIO_32;
+        for (int i = 0; i < tokenLength; i++) {
+            hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+            hash *= GOLDEN_RATIO_32;
+            pos += StringUtils.charSize(data, pos);
+        }
+        hash += tokenCount;
+    }
+
+    @Override
+    public void serializeToken(DataOutput dos) throws IOException {
+        if (tokenTypeTag > 0) {
+            dos.write(tokenTypeTag);
+        }
+
+        // serialize hash value
+        dos.writeInt(hash);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
new file mode 100644
index 0000000..aec26bc
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+    private static final long serialVersionUID = 1L;
+
+    public HashedUTF8WordTokenFactory() {
+        super();
+    }
+
+    public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
+
+    @Override
+    public IToken createToken() {
+        return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
new file mode 100644
index 0000000..2403e2f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public interface IBinaryTokenizer {
+    public IToken getToken();
+
+    public boolean hasNext();
+
+    public void next();
+
+    public void reset(byte[] data, int start, int length);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..eecd98c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.Serializable;
+
+public interface IBinaryTokenizerFactory extends Serializable {
+    public IBinaryTokenizer createTokenizer();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/INGramToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/INGramToken.java
new file mode 100644
index 0000000..ae55837
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/INGramToken.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public interface INGramToken {
+    public int getNumPostChars();
+
+    public int getNumPreChars();
+
+    public void setNumPrePostChars(int numPreChars, int numPostChars);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IToken.java
new file mode 100644
index 0000000..aa6dc86
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/IToken.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public interface IToken {
+    public byte[] getData();
+
+    public int getLength();
+
+    public int getStart();
+
+    public int getTokenLength();
+
+    public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount);
+
+    public void serializeToken(DataOutput dos) throws IOException;
+
+    public void serializeTokenCount(DataOutput dos) throws IOException;
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/ITokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
new file mode 100644
index 0000000..e5a95a1
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.Serializable;
+
+public interface ITokenFactory extends Serializable {
+    public IToken createToken();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
new file mode 100644
index 0000000..e909a24
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+public class NGramTokenizer implements Tokenizer {
+
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    public static void main(String args[]) {
+        Tokenizer tokenizer = new NGramTokenizer();
+        String a = "hadoopoop";
+        System.out.println(a + ":" + tokenizer.tokenize(a));
+    }
+
+    private final int gramLength;
+
+    /**
+     * padding used in q gram calculation.
+     */
+    private final char QGRAMENDPADDING = '$';
+
+    /**
+     * padding used in q gram calculation.
+     */
+    private final char QGRAMSTARTPADDING = '$';
+
+    public NGramTokenizer() {
+        gramLength = 3;
+    }
+
+    public NGramTokenizer(int gramLength) {
+        this.gramLength = gramLength;
+    }
+
+    private StringBuffer getAdjustedString(String input) {
+        final StringBuffer adjustedString = new StringBuffer();
+        for (int i = 0; i < gramLength - 1; i++) {
+            adjustedString.append(QGRAMSTARTPADDING);
+        }
+        adjustedString.append(input);
+        for (int i = 0; i < gramLength - 1; i++) {
+            adjustedString.append(QGRAMENDPADDING);
+        }
+        return adjustedString;
+    }
+
+    public List<String> tokenize(String input) {
+        final ArrayList<String> returnVect = new ArrayList<String>();
+        final StringBuffer adjustedString = getAdjustedString(input);
+        int curPos = 0;
+        final int length = adjustedString.length() - (gramLength - 1);
+        final HashMap<String, Integer> grams = new HashMap<String, Integer>();
+        while (curPos < length) {
+            final String term = adjustedString.substring(curPos, curPos + gramLength);
+            Integer count = grams.get(term);
+            if (count == null) {
+                count = new Integer(0);
+            }
+            count++;
+            grams.put(term, count);
+            returnVect.add(term + count);
+            curPos++;
+        }
+        return returnVect;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..362126f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
@@ -0,0 +1,116 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class NGramUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
+
+    private int gramLength;
+    private boolean usePrePost;
+
+    private int gramNum;
+    private int totalGrams;
+
+    private final INGramToken concreteToken;
+
+    public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
+            boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+        super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+        this.gramLength = gramLength;
+        this.usePrePost = usePrePost;
+        concreteToken = (INGramToken) token;
+    }
+
+    @Override
+    public boolean hasNext() {
+        if (gramNum < totalGrams) {
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    @Override
+    public void next() {
+        int currentTokenStart = index;
+        int tokenCount = 1;
+        int numPreChars = 0;
+        int numPostChars = 0;
+        if (usePrePost) {
+            numPreChars = Math.max(gramLength - gramNum - 1, 0);
+            numPostChars = (gramNum > totalGrams - gramLength) ? gramLength - totalGrams + gramNum : 0;
+        }
+        gramNum++;
+
+        concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+        if (numPreChars == 0) {
+            index += StringUtils.charSize(data, index);
+        }
+
+        // compute token count
+        // ignore pre and post grams for duplicate detection
+        if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+            int tmpIndex = start;
+            while (tmpIndex < currentTokenStart) {
+                tokenCount++; // assume found
+                int offset = 0;
+                for (int j = 0; j < gramLength; j++) {
+                    if (StringUtils.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != StringUtils
+                            .toLowerCase(StringUtils.charAt(data, tmpIndex + offset))) {
+                        tokenCount--;
+                        break;
+                    }
+                    offset += StringUtils.charSize(data, tmpIndex + offset);
+                }
+                tmpIndex += StringUtils.charSize(data, tmpIndex);
+            }
+        }
+
+        // set token
+        token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+    }
+
+    @Override
+    public void reset(byte[] data, int start, int length) {
+        super.reset(data, start, length);
+        gramNum = 0;
+
+        int numChars = 0;
+        int pos = index;
+        int end = pos + utf8Length;
+        while (pos < end) {
+            numChars++;
+            pos += StringUtils.charSize(data, pos);
+        }
+
+        if (usePrePost) {
+            totalGrams = numChars + gramLength - 1;
+        } else {
+            totalGrams = numChars - gramLength + 1;
+        }
+    }
+
+    public void setGramlength(int gramLength) {
+        this.gramLength = gramLength;
+    }
+
+    public void setPrePost(boolean usePrePost) {
+        this.usePrePost = usePrePost;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/StringUtils.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/StringUtils.java
new file mode 100644
index 0000000..48d61e7
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/StringUtils.java
@@ -0,0 +1,216 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class StringUtils {
+    public static char charAt(byte[] b, int s) {
+        int c = b[s] & 0xff;
+        switch (c >> 4) {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+                return (char) c;
+
+            case 12:
+            case 13:
+                return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
+
+            case 14:
+                return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
+
+            default:
+                throw new IllegalArgumentException();
+        }
+    }
+
+    public static int charSize(byte[] b, int s) {
+        int c = b[s] & 0xff;
+        switch (c >> 4) {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+                return 1;
+
+            case 12:
+            case 13:
+                return 2;
+
+            case 14:
+                return 3;
+        }
+        throw new IllegalStateException();
+    }
+
+    public static int getModifiedUTF8Len(char c) {
+        if (c >= 0x0000 && c <= 0x007F) {
+            return 1;
+        } else if (c <= 0x07FF) {
+            return 2;
+        } else {
+            return 3;
+        }
+    }
+
+    public static int getStrLen(byte[] b, int s) {
+        int pos = s + 2;
+        int end = pos + getUTFLen(b, s);
+        int charCount = 0;
+        while (pos < end) {
+            charCount++;
+            pos += charSize(b, pos);
+        }
+        return charCount;
+    }
+
+    public static int getUTFLen(byte[] b, int s) {
+        return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
+    }
+
+    public static char toLowerCase(char c) {
+        switch (c) {
+            case 'A':
+                return 'a';
+            case 'B':
+                return 'b';
+            case 'C':
+                return 'c';
+            case 'D':
+                return 'd';
+            case 'E':
+                return 'e';
+            case 'F':
+                return 'f';
+            case 'G':
+                return 'g';
+            case 'H':
+                return 'h';
+            case 'I':
+                return 'i';
+            case 'J':
+                return 'j';
+            case 'K':
+                return 'k';
+            case 'L':
+                return 'l';
+            case 'M':
+                return 'm';
+            case 'N':
+                return 'n';
+            case 'O':
+                return 'o';
+            case 'P':
+                return 'p';
+            case 'Q':
+                return 'q';
+            case 'R':
+                return 'r';
+            case 'S':
+                return 's';
+            case 'T':
+                return 't';
+            case 'U':
+                return 'u';
+            case 'V':
+                return 'v';
+            case 'W':
+                return 'w';
+            case 'X':
+                return 'x';
+            case 'Y':
+                return 'y';
+            case 'Z':
+                return 'z';
+            case 'Ä':
+                return 'ä';
+            case 'Ǟ':
+                return 'ǟ';
+            case 'Ë':
+                return 'ë';
+            case 'Ḧ':
+                return 'ḧ';
+            case 'Ï':
+                return 'ï';
+            case 'Ḯ':
+                return 'ḯ';
+            case 'Ö':
+                return 'ö';
+            case 'Ȫ':
+                return 'ȫ';
+            case 'Ṏ':
+                return 'ṏ';
+            case 'Ü':
+                return 'ü';
+            case 'Ǖ':
+                return 'ǖ';
+            case 'Ǘ':
+                return 'ǘ';
+            case 'Ǚ':
+                return 'ǚ';
+            case 'Ǜ':
+                return 'ǜ';
+            case 'Ṳ':
+                return 'ṳ';
+            case 'Ṻ':
+                return 'ṻ';
+            case 'Ẅ':
+                return 'ẅ';
+            case 'Ẍ':
+                return 'ẍ';
+            case 'Ÿ':
+                return 'ÿ';
+            default:
+                // since I probably missed some chars above
+                // use Java to convert to lower case to be safe
+                return Character.toLowerCase(c);
+        }
+    }
+
+    public static void writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
+
+        if (c >= 0x0000 && c <= 0x007F) {
+            dos.writeByte(c);
+        } else if (c <= 0x07FF) {
+            dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+        } else {
+            dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+            dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+        }
+    }
+
+    public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
+        dos.write((len >>> 8) & 0xFF);
+        dos.write((len >>> 0) & 0xFF);
+    }
+}
\ No newline at end of file
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Token.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Token.java
new file mode 100644
index 0000000..6c6d365
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Token.java
@@ -0,0 +1,118 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.Serializable;
+
+public class Token implements Serializable {
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    private CharSequence data;
+    private int start;
+    private int length;
+    private int count;
+
+    /** Cache the hash code for the string */
+    private int hash; // Default to 0
+
+    public Token() {
+    }
+
+    public Token(CharSequence data, int start, int length, int count) {
+        set(data, start, length, count);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (o == null) {
+            return false;
+        }
+        if (!(o instanceof Token)) {
+            return false;
+        }
+        Token t = (Token) o;
+        if (t.length != length) {
+            return false;
+        }
+        for (int i = 0; i < length; i++) {
+            if (t.data.charAt(t.start + i) != data.charAt(start + i)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    public CharSequence getCharSequence() {
+        return data;
+    }
+
+    public int getCount() {
+        return count;
+    }
+
+    public int getLength() {
+        return length;
+    }
+
+    public int getStart() {
+        return start;
+    }
+
+    @Override
+    public int hashCode() {
+        int h = hash;
+        if (h == 0 && length > 0) {
+            for (int i = 0; i < length; i++) {
+                h = 31 * h + data.charAt(start + i);
+            }
+            h = 31 * h + count;
+            hash = h;
+        }
+        return h;
+    }
+
+    public int length() {
+        return length;
+    }
+
+    public void set(CharSequence data, int start, int length, int count) {
+        this.data = data;
+        this.start = start;
+        this.length = length;
+        this.count = count;
+        hash = 0;
+    }
+
+    public void set(String data, int count) {
+        this.data = data;
+        start = 0;
+        length = data.length();
+        this.count = count;
+        hash = 0;
+    }
+
+    @Override
+    public String toString() {
+        return "(" + data.subSequence(start, start + length) + ", " + count + ")";
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Tokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Tokenizer.java
new file mode 100644
index 0000000..312c626
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/Tokenizer.java
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.Serializable;
+import java.util.List;
+
+public interface Tokenizer extends Serializable {
+    public List<String> tokenize(String text);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
new file mode 100644
index 0000000..76dc588
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public interface TokenizerBuffered {
+    public void advance();
+
+    public boolean end();
+
+    public Token getToken();
+
+    public void reset();
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
new file mode 100644
index 0000000..8be793c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class TokenizerBufferedFactory {
+    public static TokenizerBuffered getTokenizer(String tokenizer, StringBuilder buffer) {
+        if (tokenizer.equals("Word")) {
+            return new WordTokenizerBuffered(buffer);
+        }
+        throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\".");
+    }
+
+    public static boolean isSeparator(char c) {
+        return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
new file mode 100644
index 0000000..dad43e9
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class TokenizerFactory {
+    public static Tokenizer getTokenizer(String tokenizer, String wordSeparator, char tokenSeparator) {
+        if (tokenizer.equals("NGram")) {
+            return new NGramTokenizer();
+        } else if (tokenizer.equals("Word")) {
+            return new WordTokenizer(wordSeparator, tokenSeparator);
+        }
+        throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\".");
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
new file mode 100644
index 0000000..07a5e0d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
+
+    public final static char PRECHAR = '#';
+
+    public final static char POSTCHAR = '$';
+
+    protected int numPreChars;
+    protected int numPostChars;
+
+    public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
+
+    @Override
+    public int getNumPostChars() {
+        return numPreChars;
+    }
+
+    @Override
+    public int getNumPreChars() {
+        return numPostChars;
+    }
+
+    @Override
+    public void serializeToken(DataOutput dos) throws IOException {
+        handleTokenTypeTag(dos);
+
+        // regular chars
+        int numRegChars = tokenLength - numPreChars - numPostChars;
+
+        // assuming pre and post char need 1-byte each in utf8
+        int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars + numPostChars;
+
+        // write utf8 length indicator
+        StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+
+        // pre chars
+        for (int i = 0; i < numPreChars; i++) {
+            StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+        }
+
+        int pos = start;
+        for (int i = 0; i < numRegChars; i++) {
+            char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+            StringUtils.writeCharAsModifiedUTF8(c, dos);
+            pos += StringUtils.charSize(data, pos);
+        }
+
+        // post chars
+        for (int i = 0; i < numPostChars; i++) {
+            StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+        }
+    }
+
+    public void setNumPrePostChars(int numPreChars, int numPostChars) {
+        this.numPreChars = numPreChars;
+        this.numPostChars = numPostChars;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
new file mode 100644
index 0000000..4417b9c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+    private static final long serialVersionUID = 1L;
+
+    public UTF8NGramTokenFactory() {
+        super();
+    }
+
+    public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
+
+    @Override
+    public IToken createToken() {
+        return new UTF8NGramToken(tokenTypeTag, countTypeTag);
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
new file mode 100644
index 0000000..aacbfd8
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public class UTF8WordToken extends AbstractUTF8Token {
+
+    public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
+
+    @Override
+    public void serializeToken(DataOutput dos) throws IOException {
+        handleTokenTypeTag(dos);
+
+        int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
+        StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+        int pos = start;
+        for (int i = 0; i < tokenLength; i++) {
+            char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+            StringUtils.writeCharAsModifiedUTF8(c, dos);
+            pos += StringUtils.charSize(data, pos);
+        }
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
new file mode 100644
index 0000000..666f6bb
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+    private static final long serialVersionUID = 1L;
+
+    public UTF8WordTokenFactory() {
+        super();
+    }
+
+    public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+        super(tokenTypeTag, countTypeTag);
+    }
+
+    @Override
+    public IToken createToken() {
+        return new UTF8WordToken(tokenTypeTag, countTypeTag);
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizer.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
new file mode 100644
index 0000000..963ad33
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+public class WordTokenizer implements Tokenizer {
+
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    public static void main(String args[]) {
+        Tokenizer tokenizer = new WordTokenizer("_", '_');
+        String a = "hadoop_rocks_in_java";
+        System.out.println(a + ":" + tokenizer.tokenize(a));
+    }
+
+    private final String wordSeparator;
+    private final char tokenSeparator;
+
+    public WordTokenizer() {
+        this(" ", '_');
+    }
+
+    public WordTokenizer(String wordSeparator, char tokenSeparator) {
+        this.wordSeparator = wordSeparator;
+        this.tokenSeparator = tokenSeparator;
+    }
+
+    public List<String> tokenize(String input) {
+        final ArrayList<String> returnVect = new ArrayList<String>();
+        final HashMap<String, Integer> tokens = new HashMap<String, Integer>();
+        for (String term : input.split(wordSeparator)) {
+            if (term.length() == 0) {
+                continue;
+            }
+            Integer count = tokens.get(term);
+            if (count == null) {
+                count = 0;
+            }
+            count++;
+            tokens.put(term, count);
+            returnVect.add(term + tokenSeparator + count);
+        }
+        return returnVect;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
new file mode 100644
index 0000000..5a64d24
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
@@ -0,0 +1,92 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenizer;
+
+import edu.uci.ics.asterix.fuzzyjoin.IntArray;
+
+public class WordTokenizerBuffered implements TokenizerBuffered {
+
+    private final StringBuilder buffer;
+    private int index;
+    private final Token token;
+
+    private final IntArray tokensStart, tokensLength;
+
+    public WordTokenizerBuffered(StringBuilder buffer) {
+        this.buffer = buffer;
+        token = new Token();
+        tokensStart = new IntArray();
+        tokensLength = new IntArray();
+        reset();
+    }
+
+    @Override
+    public void advance() {
+        while (index < buffer.length() && TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) {
+            index++;
+        }
+        int start = index;
+        while (index < buffer.length() && !TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) {
+            buffer.setCharAt(index, Character.toLowerCase(buffer.charAt(index)));
+            index++;
+        }
+        int length = index - start;
+        int count = 1;
+        if (length > 0) {
+            // search if we got the same token before
+            for (int i = 0; i < tokensStart.length(); ++i) {
+                if (length == tokensLength.get(i)) {
+                    int tokenStart = tokensStart.get(i);
+                    count++; // assume we found it
+                    for (int j = 0; j < length; ++j) {
+                        if (buffer.charAt(start + j) != buffer.charAt(tokenStart + j)) {
+                            count--; // token not found
+                            break;
+                        }
+                    }
+                }
+            }
+            // add the new token to the list of seen tokens
+            tokensStart.add(start);
+            tokensLength.add(length);
+        }
+        // set token
+        token.set(buffer, start, length, count);
+    }
+
+    @Override
+    public boolean end() {
+        return token.length() <= 0;
+    }
+
+    @Override
+    public Token getToken() {
+        return token;
+    }
+
+    @Override
+    public void reset() {
+        index = 0;
+        tokensStart.reset();
+        tokensLength.reset();
+        advance();
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
new file mode 100644
index 0000000..d8a9185
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.io.Serializable;
+
+public interface IntTokenCountRank extends Serializable {
+    public int add(int token, int count);
+
+    public int getRank(int token, int count);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
new file mode 100644
index 0000000..acefb51
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.util.HashMap;
+
+import edu.uci.ics.asterix.fuzzyjoin.IntPair;
+
+public class IntTokenCountRankFrequency implements IntTokenCountRank {
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    private final HashMap<IntPair, Integer> ranksMap = new HashMap<IntPair, Integer>();
+    private final IntPair tmpPair = new IntPair();
+    private int crtRank = 0;
+
+    @Override
+    public int add(int token, int count) {
+        int prevRank = crtRank;
+        ranksMap.put(new IntPair(token, count), prevRank);
+        crtRank++;
+        return prevRank;
+    }
+
+    @Override
+    public int getRank(int token, int count) {
+        tmpPair.set(token, count);
+        Integer rank = ranksMap.get(tmpPair);
+        if (rank == null) {
+            return -1;
+        }
+        return rank;
+    }
+
+    @Override
+    public String toString() {
+        return "[" + crtRank + ",\n " + ranksMap + "\n]";
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRank.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
new file mode 100644
index 0000000..a70a4d9
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.io.Serializable;
+
+public interface IntTokenRank extends Serializable {
+    public int add(int token);
+
+    public int getRank(int token);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
new file mode 100644
index 0000000..da83a69
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.util.HashMap;
+
+public class IntTokenRankFrequency implements IntTokenRank {
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    private final HashMap<Integer, Integer> ranksMap = new HashMap<Integer, Integer>();
+    private int crtRank = 0;
+
+    @Override
+    public int add(int token) {
+        int prevRank = crtRank;
+        ranksMap.put(token, prevRank);
+        crtRank++;
+        return prevRank;
+    }
+
+    @Override
+    public int getRank(int token) {
+        Integer rank = ranksMap.get(token);
+        if (rank == null) {
+            return -1;
+        }
+        return rank;
+    }
+
+    @Override
+    public String toString() {
+        return "[" + crtRank + ",\n " + ranksMap + "\n]";
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenLoad.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenLoad.java
new file mode 100644
index 0000000..da6e47c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenLoad.java
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Serializable;
+
+import edu.uci.ics.asterix.fuzzyjoin.FuzzyJoinConfig;
+
+public class TokenLoad implements Serializable {
+    private final String path;
+    private final TokenRank rank;
+
+    public TokenLoad(String path, TokenRank rank) {
+        this.path = path;
+        this.rank = rank;
+    }
+
+    public void loadTokenRank() {
+        loadTokenRank(1);
+    }
+
+    public void loadTokenRank(int factor) {
+        try {
+            BufferedReader fis = new BufferedReader(
+            // new FileReader(path.toString())
+                    new InputStreamReader(new FileInputStream(path), "UTF-8"));
+            String token = null;
+            while ((token = fis.readLine()) != null) {
+                rank.add(token);
+                // only used when increasing the token dictionary
+                for (int i = 1; i < factor; i++) {
+                    // remove _COUNT at the end of the token (it is removed in
+                    // the new records anyway)
+                    rank.add(token.split(FuzzyJoinConfig.TOKEN_SEPARATOR_REGEX)[0] + i);
+                }
+            }
+        } catch (IOException ioe) {
+            throw new RuntimeException(ioe);
+        }
+    }
+}
\ No newline at end of file
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRank.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRank.java
new file mode 100644
index 0000000..6222bea
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRank.java
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.io.Serializable;
+import java.util.Collection;
+
+public interface TokenRank extends Serializable {
+    public int add(String token);
+
+    public Integer getRank(String token);
+
+    public Collection<Integer> getTokenRanks(Iterable<String> tokens);
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
new file mode 100644
index 0000000..ab7bf2f
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.util.Collection;
+import java.util.HashMap;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.Token;
+
+public class TokenRankBufferedFrequency implements TokenRank {
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    private final HashMap<Token, Integer> ranksMap = new HashMap<Token, Integer>();
+    private int crtRank = 0;
+
+    public int add(String stringWithCount) {
+        int end = stringWithCount.lastIndexOf('_');
+        int count = 0;
+        for (int i = end + 1; i < stringWithCount.length(); ++i) {
+            count = count * 10 + (stringWithCount.charAt(i) - '0');
+        }
+        return add(stringWithCount.substring(0, end), count);
+    }
+
+    public int add(String string, int count) {
+        Token token = new Token(string, 0, string.length(), count);
+        return add(token);
+    }
+
+    public int add(Token token) {
+        int prevRank = crtRank;
+        ranksMap.put(token, prevRank);
+        crtRank++;
+        return prevRank;
+    }
+
+    @Override
+    public Integer getRank(String token) {
+        throw new UnsupportedOperationException();
+    }
+
+    public Integer getRank(Token token) {
+        return ranksMap.get(token);
+    }
+
+    @Override
+    public Collection<Integer> getTokenRanks(Iterable<String> tokens) {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public String toString() {
+        return "[" + crtRank + ",\n " + ranksMap + "\n]";
+    }
+}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java
new file mode 100644
index 0000000..0ae6c35
--- /dev/null
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tokenorder;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.TreeSet;
+
+public class TokenRankFrequency implements TokenRank {
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    private final HashMap<String, Integer> ranksMap = new HashMap<String, Integer>();
+    private int crtRank = 0;
+
+    public int add(String token) {
+        int prevRank = crtRank;
+        ranksMap.put(token, prevRank);
+        crtRank++;
+        return prevRank;
+    }
+
+    public Integer getRank(String token) {
+        return ranksMap.get(token);
+    }
+
+    public Collection<Integer> getTokenRanks(Iterable<String> tokens) {
+        TreeSet<Integer> ranksCol = new TreeSet<Integer>();
+        for (String token : tokens) {
+            Integer rank = getRank(token);
+            if (rank != null) {
+                ranksCol.add(rank);
+            }
+        }
+        return ranksCol;
+    }
+
+    @Override
+    public String toString() {
+        return "[" + crtRank + ",\n " + ranksMap + "\n]";
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java
new file mode 100644
index 0000000..f5a8cec9
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.util.ArrayList;
+
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.FuzzyJoinMemory;
+import edu.uci.ics.asterix.fuzzyjoin.ResultSelfJoin;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.AbstractDataset;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.AbstractDataset.Directory;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.DBLPSmallDataset;
+
+public class FuzzyJoinTest {
+
+    private static final AbstractDataset dataset = new DBLPSmallDataset();
+    private static final String base = "data/";
+
+    @Test
+    public void test() throws Exception {
+
+        ArrayList<int[]> records = new ArrayList<int[]>();
+        ArrayList<Integer> rids = new ArrayList<Integer>();
+        ArrayList<ResultSelfJoin> results = new ArrayList<ResultSelfJoin>();
+
+        dataset.createDirecotries(new String[] { base });
+
+        FuzzyJoinMemory fj = new FuzzyJoinMemory(dataset.getThreshold());
+
+        FuzzyJoinMemory.readRecords(base + dataset.getPathPart0(Directory.SSJOININ), records, rids);
+
+        for (int[] record : records) {
+            results.addAll(fj.selfJoinAndAddRecord(record));
+        }
+
+        BufferedWriter out = new BufferedWriter(new FileWriter(base + dataset.getPathPart0(Directory.SSJOINOUT)));
+        for (ResultSelfJoin result : results) {
+            out.write(String.format("%d %d %.3f\n", rids.get(result.indexX), rids.get(result.indexY), result.similarity));
+        }
+        out.close();
+
+        FuzzyJoinTestUtil.verifyDirectory(base + dataset.getPathPart0(Directory.SSJOINOUT),
+                base + dataset.getPathExpected(Directory.SSJOINOUT));
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java
new file mode 100644
index 0000000..db44850
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+
+import org.junit.Assert;
+
+public class FuzzyJoinTestUtil {
+
+    public static void verifyDirectory(String pathTest, String pathCorrect)
+            throws IOException {
+        verifyDirectory(pathTest, pathCorrect, false);
+    }
+
+    public static void verifyDirectory(String pathTest, String pathCorrect,
+            boolean noDup) throws IOException {
+        int countTest = 0, countTestDedup = 0, countCorrect = 0;
+
+        BufferedReader input;
+        String line;
+        HashSet<String> buffer = new HashSet<String>();
+
+        // buffer Test
+        input = new BufferedReader(new FileReader(pathTest));
+        while ((line = input.readLine()) != null) {
+            buffer.add(line);
+            countTest++;
+        }
+        countTestDedup = buffer.size();
+
+        // probe Correct
+        input = new BufferedReader(new FileReader(new File(pathCorrect)));
+        while ((line = input.readLine()) != null) {
+            Assert.assertTrue(buffer.contains(line));
+            countCorrect++;
+        }
+
+        // check counts
+        Assert.assertEquals(countTestDedup, countCorrect);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
new file mode 100644
index 0000000..e65bb25
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
@@ -0,0 +1,239 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.HashedUTF8NGramTokenFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.NGramUTF8StringBinaryTokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.UTF8NGramTokenFactory;
+
+public class NGramTokenizerTest {
+
+    private char PRECHAR = '#';
+    private char POSTCHAR = '$';
+
+    private String str = "Jürgen S. Generic's Car";
+    private byte[] inputBuffer;
+
+    private int gramLength = 3;
+
+    private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
+
+        String tmp = s.toLowerCase();
+        if (prePost) {
+            StringBuilder preBuilder = new StringBuilder();
+            for (int i = 0; i < gramLength - 1; i++) {
+                preBuilder.append(PRECHAR);
+            }
+            String pre = preBuilder.toString();
+
+            StringBuilder postBuilder = new StringBuilder();
+            for (int i = 0; i < gramLength - 1; i++) {
+                postBuilder.append(POSTCHAR);
+            }
+            String post = postBuilder.toString();
+
+            tmp = pre + s.toLowerCase() + post;
+        }
+
+        for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
+            String gram = tmp.substring(i, i + gramLength);
+            grams.add(gram);
+        }
+    }
+
+    @Before
+    public void init() throws Exception {
+        // serialize string into bytes
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutput dos = new DataOutputStream(baos);
+        dos.writeUTF(str);
+        inputBuffer = baos.toByteArray();
+    }
+
+    void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
+        HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
+                false, tokenFactory);
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        ArrayList<String> expectedGrams = new ArrayList<String>();
+        getExpectedGrams(str, gramLength, expectedGrams, prePost);
+        ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+        HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
+        for (String s : expectedGrams) {
+            Integer count = gramCounts.get(s);
+            if (count == null) {
+                count = 1;
+                gramCounts.put(s, count);
+            } else {
+                count++;
+            }
+
+            int hash = tokenHash(s, count);
+            expectedHashedGrams.add(hash);
+        }
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            Integer hashedGram = in.readInt();
+
+            // System.out.println(hashedGram);
+
+            Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+
+            tokenCount++;
+        }
+        // System.out.println("---------");
+    }
+
+    void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
+        HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+                tokenFactory);
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        ArrayList<String> expectedGrams = new ArrayList<String>();
+        getExpectedGrams(str, gramLength, expectedGrams, prePost);
+        ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+        for (String s : expectedGrams) {
+            int hash = tokenHash(s, 1);
+            expectedHashedGrams.add(hash);
+        }
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            Integer hashedGram = in.readInt();
+
+            // System.out.println(hashedGram);
+
+            Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+
+            tokenCount++;
+        }
+        // System.out.println("---------");
+    }
+
+    void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
+        UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
+        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+                tokenFactory);
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        ArrayList<String> expectedGrams = new ArrayList<String>();
+        getExpectedGrams(str, gramLength, expectedGrams, prePost);
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            String strGram = in.readUTF();
+
+            // System.out.println("\"" + strGram + "\"");
+
+            Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
+
+            tokenCount++;
+        }
+        // System.out.println("---------");
+    }
+
+    @Test
+    public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
+        runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
+        runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
+    }
+
+    @Test
+    public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
+        runTestNGramTokenizerWithHashedUTF8Tokens(false);
+        runTestNGramTokenizerWithHashedUTF8Tokens(true);
+    }
+
+    @Test
+    public void testNGramTokenizerWithUTF8Tokens() throws IOException {
+        runTestNGramTokenizerWithUTF8Tokens(false);
+        runTestNGramTokenizerWithUTF8Tokens(true);
+    }
+
+    public int tokenHash(String token, int tokenCount) {
+        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+        for (int i = 0; i < token.length(); i++) {
+            h ^= token.charAt(i);
+            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+        }
+        return h + tokenCount;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java
new file mode 100644
index 0000000..8fd05da
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java
@@ -0,0 +1,214 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.Assert;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.HashedUTF8WordTokenFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
+
+public class WordTokenizerTest {
+
+    private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
+    private byte[] inputBuffer;
+
+    private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
+    private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
+    private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
+
+    @Before
+    public void init() throws IOException {
+        // serialize text into bytes
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutput dos = new DataOutputStream(baos);
+        dos.writeUTF(text);
+        inputBuffer = baos.toByteArray();
+
+        // init expected string tokens
+        expectedUTF8Tokens.add("hello");
+        expectedUTF8Tokens.add("world");
+        expectedUTF8Tokens.add("i");
+        expectedUTF8Tokens.add("would");
+        expectedUTF8Tokens.add("like");
+        expectedUTF8Tokens.add("to");
+        expectedUTF8Tokens.add("inform");
+        expectedUTF8Tokens.add("you");
+        expectedUTF8Tokens.add("of");
+        expectedUTF8Tokens.add("the");
+        expectedUTF8Tokens.add("importance");
+        expectedUTF8Tokens.add("of");
+        expectedUTF8Tokens.add("foo");
+        expectedUTF8Tokens.add("bar");
+        expectedUTF8Tokens.add("yes");
+        expectedUTF8Tokens.add("foo");
+        expectedUTF8Tokens.add("bar");
+        expectedUTF8Tokens.add("jürgen");
+
+        // hashed tokens ignoring token count
+        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
+            int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
+            expectedHashedUTF8Tokens.add(hash);
+        }
+
+        // hashed tokens using token count
+        HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
+        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
+            Integer count = tokenCounts.get(expectedUTF8Tokens.get(i));
+            if (count == null) {
+                count = 1;
+                tokenCounts.put(expectedUTF8Tokens.get(i), count);
+            } else {
+                count++;
+            }
+
+            int hash = tokenHash(expectedUTF8Tokens.get(i), count);
+            expectedCountedHashedUTF8Tokens.add(hash);
+        }
+    }
+
+    @Test
+    public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
+
+        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false,
+                tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            Integer hashedToken = in.readInt();
+
+            // System.out.println(hashedToken);
+
+            Assert.assertEquals(hashedToken, expectedCountedHashedUTF8Tokens.get(tokenCount));
+
+            tokenCount++;
+        }
+    }
+
+    @Test
+    public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
+
+        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            Integer hashedToken = in.readInt();
+
+            // System.out.println(hashedToken);
+
+            Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken);
+
+            tokenCount++;
+        }
+    }
+
+    @Test
+    public void testWordTokenizerWithUTF8Tokens() throws IOException {
+
+        UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            String strToken = in.readUTF();
+
+            // System.out.println(strToken);
+
+            Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
+
+            tokenCount++;
+        }
+    }
+
+    // JAQL
+    public int tokenHash(String token, int tokenCount) {
+        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+        for (int i = 0; i < token.length(); i++) {
+            h ^= token.charAt(i);
+            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+        }
+        return h + tokenCount;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java
new file mode 100644
index 0000000..5ca6c6d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java
@@ -0,0 +1,158 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.io.File;
+import java.util.NoSuchElementException;
+
+public abstract class AbstractDataset {
+    public static enum Directory {
+        RAW_R,
+        RAW_S,
+        RECORDPAIRS,
+        RECORDS_R,
+        RECORDS_S,
+        RECORDSBULK_R,
+        RECORDSBULK_S,
+        RIDPAIRS,
+        SSJOININ,
+        SSJOINOUT,
+        TOKENS,
+        TOKENS_R,
+        TOKENS_R_AQL,
+    }
+
+    public static enum Relation {
+        R, S,
+    }
+
+    public static final String FILE_PART = "part-";
+    public static final String FILE_PART0 = FILE_PART + "00000";
+    public static final String FILE_EXPECTED = "expected.txt";
+    public static final String AQL = "aql";
+
+    public static final String PATH_RAW = "raw";
+    public static final String PATH_RECORDPAIRS = "recordpairs";
+    public static final String PATH_RECORDS = "records";
+    public static final String PATH_RECORDSBULK = "recordsbulk";
+    public static final String PATH_RIDPAIRS = "ridpairs";
+    public static final String PATH_SSJOININ = "ssjoin.in";
+    public static final String PATH_SSJOINOUT = "ssjoin.out";
+    public static final String PATH_TOKENS = "tokens";
+
+    public static final String DIRECTORY_ID_FORMAT = "%03d";
+
+    public void createDirecotries(String[] paths) {
+        createDirecotries(paths, 0);
+    }
+
+    public void createDirecotries(String[] paths, int crtCopy) {
+        (new File(paths[0] + getPathDirecotry(Directory.SSJOINOUT, 0))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.RECORDSBULK_R, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.RECORDSBULK_S, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.RECORDS_R, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.RECORDS_S, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.TOKENS, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.TOKENS_R, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.TOKENS_R_AQL, crtCopy))).mkdir();
+    }
+
+    public abstract String getName();
+
+    public abstract int getNoRecords();
+
+    public abstract String getPath();
+
+    public String getPathDirecotry(Directory directory, int crtCopy) {
+        return getPathDirectory(getPath(), directory, crtCopy);
+    }
+
+    private String getPathDirectory(Directory directory, int crtCopy, boolean expected) {
+        return getPathDirectory(getName() + (expected ? ".expected" : ""), directory, crtCopy);
+    }
+
+    public String getPathDirectory(String path, Directory directory, int crtCopy) {
+        path += '/';
+        switch (directory) {
+            case SSJOININ:
+                path += AbstractDataset.PATH_SSJOININ;
+                break;
+            case SSJOINOUT:
+                path += AbstractDataset.PATH_SSJOINOUT;
+                break;
+            case RAW_R:
+                path += AbstractDataset.PATH_RAW + "." + getSuffix(Relation.R);
+                break;
+            case RAW_S:
+                path += AbstractDataset.PATH_RAW + "." + getSuffix(Relation.S);
+                break;
+            case RECORDSBULK_R:
+                path += AbstractDataset.PATH_RECORDSBULK + "." + getSuffix(Relation.R);
+                break;
+            case RECORDSBULK_S:
+                path += AbstractDataset.PATH_RECORDSBULK + "." + getSuffix(Relation.S);
+                break;
+            case RECORDS_R:
+                path += AbstractDataset.PATH_RECORDS + "." + getSuffix(Relation.R);
+                break;
+            case RECORDS_S:
+                path += AbstractDataset.PATH_RECORDS + "." + getSuffix(Relation.S);
+                break;
+            case TOKENS:
+                path += AbstractDataset.PATH_TOKENS;
+                break;
+            case TOKENS_R:
+                path += AbstractDataset.PATH_TOKENS + "." + getSuffix(Relation.R);
+                break;
+            case TOKENS_R_AQL:
+                path += AbstractDataset.PATH_TOKENS + "." + getSuffix(Relation.R) + "." + AQL;
+                break;
+            case RIDPAIRS:
+                path += AbstractDataset.PATH_RIDPAIRS;
+                break;
+            case RECORDPAIRS:
+                path += AbstractDataset.PATH_RECORDPAIRS;
+                break;
+            default:
+                throw new NoSuchElementException();
+        }
+        return path + "-" + String.format(DIRECTORY_ID_FORMAT, crtCopy);
+    }
+
+    public String getPathExpected(Directory directory) {
+        return getPathDirectory(directory, 0, true) + '/' + FILE_EXPECTED;
+    }
+
+    public String getPathPart(Directory directory, int crtCopy) {
+        return getPathDirecotry(directory, crtCopy) + '/' + FILE_PART;
+    }
+
+    public String getPathPart0(Directory directory) {
+        return getPathDirectory(directory, 0, false) + '/' + FILE_PART0;
+    }
+
+    public String getPathPart0(Directory directory, boolean expected) {
+        return getPathDirectory(directory, 0, expected) + '/' + (expected ? FILE_EXPECTED : FILE_PART0);
+    }
+
+    public abstract String getSuffix(Relation relation);
+
+    public abstract float getThreshold();
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java
new file mode 100644
index 0000000..5333cad
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java
@@ -0,0 +1,5 @@
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public abstract class AbstractTokenizableDataset extends AbstractDataset {
+    public abstract String getRecordData();
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java
new file mode 100644
index 0000000..3783829
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+
+public class DBLPDataset extends PublicationsDataset {
+    private static final String NAME = "dblp";
+    private static final int NO_RECORDS = 1268017;
+    private static final float THRESHOLD = .8f;
+    private static final String RECORD_DATA = "2,3";
+
+    public DBLPDataset() {
+        super(NAME, NO_RECORDS, THRESHOLD, RECORD_DATA, NAME, NAME);
+    }
+
+    public DBLPDataset(String recordData) {
+        super(NAME, NO_RECORDS, THRESHOLD, recordData, NAME, NAME);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java
new file mode 100644
index 0000000..5eaebd2
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class DBLPSmallDataset extends PublicationsDataset {
+    public DBLPSmallDataset() {
+        super("dblp-small", 100, .5f, "2,3", "dblp", "dblp");
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java
new file mode 100644
index 0000000..38aad37
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class IntArrayBagSmallDataset extends AbstractDataset {
+    private final int NO_RECORDS = 4;
+    private final String NAME = "intarray-bag-small";
+    private final String PATH = NAME;
+    private final float THRESHOLD = .5f;
+
+    public IntArrayBagSmallDataset() {
+    }
+
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    @Override
+    public String getPath() {
+        return PATH;
+    }
+
+    @Override
+    public String getSuffix(Relation relation) {
+        return "r";
+    }
+
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java
new file mode 100644
index 0000000..7c8c80d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class IntArraySetSmallDataset extends AbstractDataset {
+    private final int NO_RECORDS = 4;
+    private final String NAME = "intarray-set-small";
+    private final String PATH = NAME;
+    private final float THRESHOLD = .5f;
+
+    public IntArraySetSmallDataset() {
+    }
+
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    @Override
+    public String getPath() {
+        return PATH;
+    }
+
+    @Override
+    public String getSuffix(Relation relation) {
+        return "r";
+    }
+
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java
new file mode 100644
index 0000000..e8c2f2a
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class PUBDataset extends PublicationsDataset {
+    private static final String DBLP_SUFFIX = "dblp";
+    private static final String CSX_SUFFIX = "csx";
+    private static final String NAME = "pub";
+    private static final int NO_RECORDS = 1385532;
+    private static final float THRESHOLD = .8f;
+    private static final String RECORD_DATA = "2,3";
+
+    public PUBDataset() {
+        super(NAME, NO_RECORDS, THRESHOLD, RECORD_DATA, DBLP_SUFFIX, CSX_SUFFIX);
+    }
+
+    public PUBDataset(float threshold) {
+        super(NAME, NO_RECORDS, threshold, RECORD_DATA, DBLP_SUFFIX, CSX_SUFFIX);
+    }
+
+    public PUBDataset(float threshold, String recordData) {
+        super(NAME, NO_RECORDS, threshold, recordData, DBLP_SUFFIX, CSX_SUFFIX);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java
new file mode 100644
index 0000000..eed28e4
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class PUBSmallDataset extends PublicationsDataset {
+    public PUBSmallDataset() {
+        super("pub-small", 100, .5f, "2,3", "dblp", "csx");
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java
new file mode 100644
index 0000000..e1653cd
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.util.NoSuchElementException;
+
+public class PublicationsDataset extends AbstractTokenizableDataset {
+    protected final String name;
+    protected final String path;
+    protected final int noRecords;
+    protected final float threshold;
+    protected final String recordData;
+    protected final String rSuffix, sSuffix;
+
+    public PublicationsDataset(String name, int noRecords, float threshold, String recordData, String rSuffix,
+            String sSuffix) {
+        this.name = name;
+        this.noRecords = noRecords;
+        this.threshold = threshold;
+        this.recordData = recordData;
+        this.rSuffix = rSuffix;
+        this.sSuffix = sSuffix;
+
+        path = name;
+    }
+
+    @Override
+    public String getName() {
+        return name;
+    }
+
+    @Override
+    public int getNoRecords() {
+        return noRecords;
+    }
+
+    @Override
+    public String getPath() {
+        return path;
+    }
+
+    @Override
+    public String getRecordData() {
+        return recordData;
+    }
+
+    @Override
+    public String getSuffix(Relation relation) {
+        switch (relation) {
+            case R:
+                return rSuffix;
+            case S:
+                return sSuffix;
+            default:
+                throw new NoSuchElementException();
+        }
+    }
+
+    @Override
+    public float getThreshold() {
+        return threshold;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java
new file mode 100644
index 0000000..6463b2d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.util.NoSuchElementException;
+
+public class UsersVisitorsSmallDataset extends AbstractDataset {
+    private final int NO_RECORDS = 4;
+    private final String NAME = "users-visitors-small";
+    private static final String USERS_SUFFIX = "users";
+    private static final String VISITORS_SUFFIX = "visitors";
+    private final String PATH = NAME;
+    private final float THRESHOLD = .5f;
+
+    public UsersVisitorsSmallDataset() {
+    }
+
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    @Override
+    public String getPath() {
+        return PATH;
+    }
+
+    @Override
+    public String getSuffix(Relation relation) {
+        switch (relation) {
+            case R:
+                return USERS_SUFFIX;
+            case S:
+                return VISITORS_SUFFIX;
+            default:
+                throw new NoSuchElementException();
+        }
+    }
+
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+
+}
diff --git a/asterix-fuzzyjoin/src/test/scripts/conf.sh b/asterix-fuzzyjoin/src/test/scripts/conf.sh
new file mode 100644
index 0000000..45e962c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/conf.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS"; BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+# 
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+### http://www.cse.unsw.edu.au/~weiw/project/simjoin.html
+SSJOIN=/home/rares/workspace/ssjoin-bin
+DATA=../data
+
+IN=ssjoin.in-000
+OUT=ssjoin.out-000
diff --git a/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh b/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh
new file mode 100755
index 0000000..0cd6ccc
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS"; BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+# 
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DIR=`dirname $0`; if [ "${DIR:0:1}" == "." ]; then DIR=`pwd`"${DIR:1}"; fi
+source $DIR/conf.sh
+
+ARGS=1                   # Required number of arguments
+E_BADARGS=85             # Wrong number of arguments passed to script.
+if [ $# -lt "$ARGS" ]
+then
+  echo "Usage:   `basename $0` dataset"
+  echo "Example: `basename $0` dblp-small"
+  exit $E_BADARGS
+fi
+
+THR="0.80"
+if [ "$1" == "dblp-small" ]; then
+    THR="0.50"
+fi
+
+
+mkdir $DATA/$1.expected/$OUT
+$SSJOIN/ppjoinplus j $THR $DATA/$1/$IN/part-00000 | \
+    sed 's/0\.812/0\.813/' | \
+    sort > $DATA/$1.expected/$OUT/expected.txt
+
+mkdir $DATA/$1/$OUT
+java \
+    -Xmx2g \
+    -jar $DIR/../../../target/fuzzyjoin-core-0.0.1.jar \
+    $THR $DATA/$1/$IN/part-00000 | \
+    sort > $DATA/$1/$OUT/part-00000
+
+diff $DATA/$1.expected/$OUT/expected.txt $DATA/$1/$OUT/part-00000
diff --git a/asterix-fuzzyjoin/src/test/scripts/inmemory.sh b/asterix-fuzzyjoin/src/test/scripts/inmemory.sh
new file mode 100755
index 0000000..c9394d4
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/inmemory.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS"; BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+# 
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DATA="/home/rares/data/fuzzyjoin/dblp/records-024"
+FUZZYJOIN="/home/rares/fuzzyjoin/fuzzyjoin-core/target/fuzzyjoin-core-0.0.2-SNAPSHOT.jar"
+
+echo "-- - Step 0: Project and append length - --"
+
+# java -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinAppendLength $DATA/part-00000 $DATA/part-00000-len
+
+date
+
+echo "== START =="
+
+echo "-- - Step 1: Sort by length - --"
+
+# time sort -n -k 5 -t ":" $DATA/part-00000-len > $DATA/part-00000-len-sorted
+
+echo "-- - Step 2: Tokenize - --"
+
+# time java -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinTokenize $DATA/part-00000-len-sorted $DATA/part-00000-tokens $DATA/part-00000-tokenized
+
+echo "-- - Step 3: RID pairs - --"
+
+time java -Xmx8g -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinMemory .8 $DATA/part-00000-tokenized > $DATA/part-00000-ridpairs
+
+echo "== END =="
+
+date
+
+
+### SSJoin ###
+# cut -d ":" -f 3,4 records-000/part-0000* > ! ssjoin.raw-000/part-00000
+# ~/workspace/ssjoin-bin/txtformat ssjoin.raw-000/part-00000 ssjoin.norm-000/part-00000 l
+# sed 's/_\+/ /g' ssjoin.norm-000/part-00000 > ! ssjoin.space-000/part-00000
+# ~/workspace/ssjoin-bin/tokenizer ssjoin.space-000/part-00000
+# ~/workspace/ssjoin-bin/ppjoinplus j .8 ssjoin.space-000/part-00000.bin > /dev/null
+# java -jar /fuzzyjoin/fuzzyjoin-core/target/fuzzyjoin-core-0.0.2-SNAPSHOT.jar .8 ssjoin.space-000/part-00000.bin > /dev/null
diff --git a/asterix-fuzzyjoin/src/test/scripts/tokenize.sh b/asterix-fuzzyjoin/src/test/scripts/tokenize.sh
new file mode 100755
index 0000000..5498d44
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/tokenize.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS"; BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+# 
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DIR=`dirname $0`; if [ "${DIR:0:1}" == "." ]; then DIR=`pwd`"${DIR:1}"; fi
+source $DIR/conf.sh
+
+ARGS=1                   # Required number of arguments
+E_BADARGS=85             # Wrong number of arguments passed to script.
+if [ $# -lt "$ARGS" ]
+then
+  echo "Usage:   `basename $0` dataset"
+  echo "Example: `basename $0` dblp-small"
+  exit $E_BADARGS
+fi
+
+$SSJOIN/tokenizer $DATA/$1/raw-000/part-00000 $2
+mkdir $DATA/$1/$IN
+mv $DATA/$1/raw-000/part-00000.bin $DATA/$1/$IN/part-00000
+