adding fuzzyjoin code to git

commit: 82838a2cc0c73a40912a5e9fb4b5cd2027ba7d21 [log] [tgz]
author: icetindil <icetindil@gmail.com> Fri Oct 11 16:41:18 2013 -0700
committer: icetindil <icetindil@gmail.com> Wed Jan 29 00:32:16 2014 -0800
tree: efa719584a01ff3ac25015f7cd90624b8247f2e8
parent: aa89009750bf17e24d07883f61fe0c0296f4d3a3 [diff]
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java
new file mode 100644
index 0000000..f5a8cec
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java

@@ -0,0 +1,65 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.util.ArrayList;
+
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.FuzzyJoinMemory;
+import edu.uci.ics.asterix.fuzzyjoin.ResultSelfJoin;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.AbstractDataset;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.AbstractDataset.Directory;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.DBLPSmallDataset;
+
+/**
+ * End-to-end check of the in-memory fuzzy self-join on the small DBLP dataset.
+ */
+public class FuzzyJoinTest {
+
+    private static final AbstractDataset dataset = new DBLPSmallDataset();
+    private static final String base = "data/";
+
+    /**
+     * Reads the records of the {@code SSJOININ} part file, self-joins them one
+     * at a time, writes every result as {@code "<ridX> <ridY> <similarity>"} to
+     * the {@code SSJOINOUT} part file and compares that file with the expected
+     * output of the dataset.
+     *
+     * @throws Exception
+     *             if reading, joining, writing or verification fails
+     */
+    @Test
+    public void test() throws Exception {
+
+        ArrayList<int[]> records = new ArrayList<int[]>();
+        ArrayList<Integer> rids = new ArrayList<Integer>();
+        ArrayList<ResultSelfJoin> results = new ArrayList<ResultSelfJoin>();
+
+        dataset.createDirecotries(new String[] { base });
+
+        FuzzyJoinMemory fj = new FuzzyJoinMemory(dataset.getThreshold());
+
+        FuzzyJoinMemory.readRecords(base + dataset.getPathPart0(Directory.SSJOININ), records, rids);
+
+        for (int[] record : records) {
+            results.addAll(fj.selfJoinAndAddRecord(record));
+        }
+
+        String outputPath = base + dataset.getPathPart0(Directory.SSJOINOUT);
+        BufferedWriter out = new BufferedWriter(new FileWriter(outputPath));
+        try {
+            for (ResultSelfJoin result : results) {
+                // Fixed locale: the decimal separator of %.3f must not depend on
+                // the default locale of the machine running the test.
+                out.write(String.format(java.util.Locale.US, "%d %d %.3f\n", rids.get(result.indexX),
+                        rids.get(result.indexY), result.similarity));
+            }
+        } finally {
+            // Release the file handle even when formatting or writing fails.
+            out.close();
+        }
+
+        FuzzyJoinTestUtil.verifyDirectory(outputPath, base + dataset.getPathExpected(Directory.SSJOINOUT));
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java
new file mode 100644
index 0000000..db44850
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java

@@ -0,0 +1,63 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+
+import org.junit.Assert;
+
+/**
+ * Helpers for comparing a produced result file against an expected one.
+ */
+public class FuzzyJoinTestUtil {
+
+    /**
+     * Same as {@link #verifyDirectory(String, String, boolean)} with
+     * {@code noDup} set to {@code false}.
+     *
+     * @param pathTest
+     *            path of the file produced by the test
+     * @param pathCorrect
+     *            path of the file holding the expected lines
+     * @throws IOException
+     *             if either file cannot be read
+     */
+    public static void verifyDirectory(String pathTest, String pathCorrect)
+            throws IOException {
+        verifyDirectory(pathTest, pathCorrect, false);
+    }
+
+    /**
+     * Asserts that every line of {@code pathCorrect} occurs in {@code pathTest}
+     * and that the number of lines read from {@code pathCorrect} equals the
+     * number of distinct lines in {@code pathTest}. Line order is ignored.
+     *
+     * @param pathTest
+     *            path of the file produced by the test
+     * @param pathCorrect
+     *            path of the file holding the expected lines
+     * @param noDup
+     *            currently not consulted by this method
+     * @throws IOException
+     *             if either file cannot be read
+     */
+    public static void verifyDirectory(String pathTest, String pathCorrect,
+            boolean noDup) throws IOException {
+        // NOTE(review): noDup is accepted but never used; presumably it was meant
+        // to reject duplicate lines in the test output - confirm before wiring it up.
+        HashSet<String> buffer = new HashSet<String>();
+        String line;
+
+        // buffer Test
+        BufferedReader input = new BufferedReader(new FileReader(pathTest));
+        try {
+            while ((line = input.readLine()) != null) {
+                buffer.add(line);
+            }
+        } finally {
+            input.close();
+        }
+        int countTestDedup = buffer.size();
+
+        // probe Correct
+        int countCorrect = 0;
+        input = new BufferedReader(new FileReader(new File(pathCorrect)));
+        try {
+            while ((line = input.readLine()) != null) {
+                Assert.assertTrue("expected line missing from " + pathTest + ": " + line, buffer.contains(line));
+                countCorrect++;
+            }
+        } finally {
+            input.close();
+        }
+
+        // check counts
+        Assert.assertEquals(countTestDedup, countCorrect);
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
new file mode 100644
index 0000000..e65bb25
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java

@@ -0,0 +1,239 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.HashedUTF8NGramTokenFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.NGramUTF8StringBinaryTokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.UTF8NGramTokenFactory;
+
+public class NGramTokenizerTest {
+
+    private char PRECHAR = '#';
+    private char POSTCHAR = '$';
+
+    private String str = "Jürgen S. Generic's Car";
+    private byte[] inputBuffer;
+
+    private int gramLength = 3;
+
+    private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
+
+        String tmp = s.toLowerCase();
+        if (prePost) {
+            StringBuilder preBuilder = new StringBuilder();
+            for (int i = 0; i < gramLength - 1; i++) {
+                preBuilder.append(PRECHAR);
+            }
+            String pre = preBuilder.toString();
+
+            StringBuilder postBuilder = new StringBuilder();
+            for (int i = 0; i < gramLength - 1; i++) {
+                postBuilder.append(POSTCHAR);
+            }
+            String post = postBuilder.toString();
+
+            tmp = pre + s.toLowerCase() + post;
+        }
+
+        for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
+            String gram = tmp.substring(i, i + gramLength);
+            grams.add(gram);
+        }
+    }
+
+    @Before
+    public void init() throws Exception {
+        // serialize string into bytes
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutput dos = new DataOutputStream(baos);
+        dos.writeUTF(str);
+        inputBuffer = baos.toByteArray();
+    }
+
+    void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
+        HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
+                false, tokenFactory);
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        ArrayList<String> expectedGrams = new ArrayList<String>();
+        getExpectedGrams(str, gramLength, expectedGrams, prePost);
+        ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+        HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
+        for (String s : expectedGrams) {
+            Integer count = gramCounts.get(s);
+            if (count == null) {
+                count = 1;
+                gramCounts.put(s, count);
+            } else {
+                count++;
+            }
+
+            int hash = tokenHash(s, count);
+            expectedHashedGrams.add(hash);
+        }
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            Integer hashedGram = in.readInt();
+
+            // System.out.println(hashedGram);
+
+            Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+
+            tokenCount++;
+        }
+        // System.out.println("---------");
+    }
+
+    void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
+        HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+                tokenFactory);
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        ArrayList<String> expectedGrams = new ArrayList<String>();
+        getExpectedGrams(str, gramLength, expectedGrams, prePost);
+        ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+        for (String s : expectedGrams) {
+            int hash = tokenHash(s, 1);
+            expectedHashedGrams.add(hash);
+        }
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            Integer hashedGram = in.readInt();
+
+            // System.out.println(hashedGram);
+
+            Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+
+            tokenCount++;
+        }
+        // System.out.println("---------");
+    }
+
+    void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
+        UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
+        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+                tokenFactory);
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        ArrayList<String> expectedGrams = new ArrayList<String>();
+        getExpectedGrams(str, gramLength, expectedGrams, prePost);
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            String strGram = in.readUTF();
+
+            // System.out.println("\"" + strGram + "\"");
+
+            Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
+
+            tokenCount++;
+        }
+        // System.out.println("---------");
+    }
+
+    @Test
+    public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
+        runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
+        runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
+    }
+
+    @Test
+    public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
+        runTestNGramTokenizerWithHashedUTF8Tokens(false);
+        runTestNGramTokenizerWithHashedUTF8Tokens(true);
+    }
+
+    @Test
+    public void testNGramTokenizerWithUTF8Tokens() throws IOException {
+        runTestNGramTokenizerWithUTF8Tokens(false);
+        runTestNGramTokenizerWithUTF8Tokens(true);
+    }
+
+    public int tokenHash(String token, int tokenCount) {
+        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+        for (int i = 0; i < token.length(); i++) {
+            h ^= token.charAt(i);
+            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+        }
+        return h + tokenCount;
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java
new file mode 100644
index 0000000..8fd05da
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java

@@ -0,0 +1,214 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.Assert;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.HashedUTF8WordTokenFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
+
+public class WordTokenizerTest {
+
+    private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
+    private byte[] inputBuffer;
+
+    private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
+    private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
+    private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
+
+    @Before
+    public void init() throws IOException {
+        // serialize text into bytes
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutput dos = new DataOutputStream(baos);
+        dos.writeUTF(text);
+        inputBuffer = baos.toByteArray();
+
+        // init expected string tokens
+        expectedUTF8Tokens.add("hello");
+        expectedUTF8Tokens.add("world");
+        expectedUTF8Tokens.add("i");
+        expectedUTF8Tokens.add("would");
+        expectedUTF8Tokens.add("like");
+        expectedUTF8Tokens.add("to");
+        expectedUTF8Tokens.add("inform");
+        expectedUTF8Tokens.add("you");
+        expectedUTF8Tokens.add("of");
+        expectedUTF8Tokens.add("the");
+        expectedUTF8Tokens.add("importance");
+        expectedUTF8Tokens.add("of");
+        expectedUTF8Tokens.add("foo");
+        expectedUTF8Tokens.add("bar");
+        expectedUTF8Tokens.add("yes");
+        expectedUTF8Tokens.add("foo");
+        expectedUTF8Tokens.add("bar");
+        expectedUTF8Tokens.add("jürgen");
+
+        // hashed tokens ignoring token count
+        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
+            int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
+            expectedHashedUTF8Tokens.add(hash);
+        }
+
+        // hashed tokens using token count
+        HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
+        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
+            Integer count = tokenCounts.get(expectedUTF8Tokens.get(i));
+            if (count == null) {
+                count = 1;
+                tokenCounts.put(expectedUTF8Tokens.get(i), count);
+            } else {
+                count++;
+            }
+
+            int hash = tokenHash(expectedUTF8Tokens.get(i), count);
+            expectedCountedHashedUTF8Tokens.add(hash);
+        }
+    }
+
+    @Test
+    public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
+
+        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false,
+                tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            Integer hashedToken = in.readInt();
+
+            // System.out.println(hashedToken);
+
+            Assert.assertEquals(hashedToken, expectedCountedHashedUTF8Tokens.get(tokenCount));
+
+            tokenCount++;
+        }
+    }
+
+    @Test
+    public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
+
+        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            Integer hashedToken = in.readInt();
+
+            // System.out.println(hashedToken);
+
+            Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken);
+
+            tokenCount++;
+        }
+    }
+
+    @Test
+    public void testWordTokenizerWithUTF8Tokens() throws IOException {
+
+        UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+
+            String strToken = in.readUTF();
+
+            // System.out.println(strToken);
+
+            Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
+
+            tokenCount++;
+        }
+    }
+
+    // JAQL
+    public int tokenHash(String token, int tokenCount) {
+        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+        for (int i = 0; i < token.length(); i++) {
+            h ^= token.charAt(i);
+            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+        }
+        return h + tokenCount;
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java
new file mode 100644
index 0000000..5ca6c6d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java

@@ -0,0 +1,158 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.io.File;
+import java.util.NoSuchElementException;
+
+public abstract class AbstractDataset {
+    public static enum Directory {
+        RAW_R,
+        RAW_S,
+        RECORDPAIRS,
+        RECORDS_R,
+        RECORDS_S,
+        RECORDSBULK_R,
+        RECORDSBULK_S,
+        RIDPAIRS,
+        SSJOININ,
+        SSJOINOUT,
+        TOKENS,
+        TOKENS_R,
+        TOKENS_R_AQL,
+    }
+
+    public static enum Relation {
+        R, S,
+    }
+
+    public static final String FILE_PART = "part-";
+    public static final String FILE_PART0 = FILE_PART + "00000";
+    public static final String FILE_EXPECTED = "expected.txt";
+    public static final String AQL = "aql";
+
+    public static final String PATH_RAW = "raw";
+    public static final String PATH_RECORDPAIRS = "recordpairs";
+    public static final String PATH_RECORDS = "records";
+    public static final String PATH_RECORDSBULK = "recordsbulk";
+    public static final String PATH_RIDPAIRS = "ridpairs";
+    public static final String PATH_SSJOININ = "ssjoin.in";
+    public static final String PATH_SSJOINOUT = "ssjoin.out";
+    public static final String PATH_TOKENS = "tokens";
+
+    public static final String DIRECTORY_ID_FORMAT = "%03d";
+
+    public void createDirecotries(String[] paths) {
+        createDirecotries(paths, 0);
+    }
+
+    public void createDirecotries(String[] paths, int crtCopy) {
+        (new File(paths[0] + getPathDirecotry(Directory.SSJOINOUT, 0))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.RECORDSBULK_R, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.RECORDSBULK_S, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.RECORDS_R, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.RECORDS_S, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.TOKENS, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.TOKENS_R, crtCopy))).mkdir();
+        (new File(paths[0] + getPathDirecotry(Directory.TOKENS_R_AQL, crtCopy))).mkdir();
+    }
+
+    public abstract String getName();
+
+    public abstract int getNoRecords();
+
+    public abstract String getPath();
+
+    public String getPathDirecotry(Directory directory, int crtCopy) {
+        return getPathDirectory(getPath(), directory, crtCopy);
+    }
+
+    private String getPathDirectory(Directory directory, int crtCopy, boolean expected) {
+        return getPathDirectory(getName() + (expected ? ".expected" : ""), directory, crtCopy);
+    }
+
+    public String getPathDirectory(String path, Directory directory, int crtCopy) {
+        path += '/';
+        switch (directory) {
+            case SSJOININ:
+                path += AbstractDataset.PATH_SSJOININ;
+                break;
+            case SSJOINOUT:
+                path += AbstractDataset.PATH_SSJOINOUT;
+                break;
+            case RAW_R:
+                path += AbstractDataset.PATH_RAW + "." + getSuffix(Relation.R);
+                break;
+            case RAW_S:
+                path += AbstractDataset.PATH_RAW + "." + getSuffix(Relation.S);
+                break;
+            case RECORDSBULK_R:
+                path += AbstractDataset.PATH_RECORDSBULK + "." + getSuffix(Relation.R);
+                break;
+            case RECORDSBULK_S:
+                path += AbstractDataset.PATH_RECORDSBULK + "." + getSuffix(Relation.S);
+                break;
+            case RECORDS_R:
+                path += AbstractDataset.PATH_RECORDS + "." + getSuffix(Relation.R);
+                break;
+            case RECORDS_S:
+                path += AbstractDataset.PATH_RECORDS + "." + getSuffix(Relation.S);
+                break;
+            case TOKENS:
+                path += AbstractDataset.PATH_TOKENS;
+                break;
+            case TOKENS_R:
+                path += AbstractDataset.PATH_TOKENS + "." + getSuffix(Relation.R);
+                break;
+            case TOKENS_R_AQL:
+                path += AbstractDataset.PATH_TOKENS + "." + getSuffix(Relation.R) + "." + AQL;
+                break;
+            case RIDPAIRS:
+                path += AbstractDataset.PATH_RIDPAIRS;
+                break;
+            case RECORDPAIRS:
+                path += AbstractDataset.PATH_RECORDPAIRS;
+                break;
+            default:
+                throw new NoSuchElementException();
+        }
+        return path + "-" + String.format(DIRECTORY_ID_FORMAT, crtCopy);
+    }
+
+    public String getPathExpected(Directory directory) {
+        return getPathDirectory(directory, 0, true) + '/' + FILE_EXPECTED;
+    }
+
+    public String getPathPart(Directory directory, int crtCopy) {
+        return getPathDirecotry(directory, crtCopy) + '/' + FILE_PART;
+    }
+
+    public String getPathPart0(Directory directory) {
+        return getPathDirectory(directory, 0, false) + '/' + FILE_PART0;
+    }
+
+    public String getPathPart0(Directory directory, boolean expected) {
+        return getPathDirectory(directory, 0, expected) + '/' + (expected ? FILE_EXPECTED : FILE_PART0);
+    }
+
+    public abstract String getSuffix(Relation relation);
+
+    public abstract float getThreshold();
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java
new file mode 100644
index 0000000..5333cad
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java

@@ -0,0 +1,5 @@
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+/**
+ * An {@link AbstractDataset} that additionally exposes a record-data
+ * description string.
+ */
+public abstract class AbstractTokenizableDataset extends AbstractDataset {
+    // NOTE(review): the format of the returned string is not defined here; the
+    // DBLP datasets pass "2,3", presumably a comma-separated field list - confirm.
+    public abstract String getRecordData();
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java
new file mode 100644
index 0000000..3783829
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java

@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+
+/**
+ * The full DBLP publications dataset.
+ */
+public class DBLPDataset extends PublicationsDataset {
+    private static final String NAME = "dblp";
+    private static final int NO_RECORDS = 1268017;
+    private static final float THRESHOLD = .8f;
+    private static final String RECORD_DATA = "2,3";
+
+    /** Creates the dataset with the default record data ({@code "2,3"}). */
+    public DBLPDataset() {
+        this(RECORD_DATA);
+    }
+
+    /**
+     * Creates the dataset with caller-supplied record data.
+     *
+     * @param recordData
+     *            record data passed through to {@link PublicationsDataset}
+     */
+    public DBLPDataset(String recordData) {
+        super(NAME, NO_RECORDS, THRESHOLD, recordData, NAME, NAME);
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java
new file mode 100644
index 0000000..5eaebd2
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java

@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+/**
+ * Descriptor for the 100-record "dblp-small" publications dataset. Both
+ * relations use the "dblp" suffix.
+ */
+public class DBLPSmallDataset extends PublicationsDataset {
+    private static final String NAME = "dblp-small";
+    private static final int NO_RECORDS = 100;
+    private static final float THRESHOLD = .5f;
+    private static final String RECORD_DATA = "2,3";
+    private static final String SUFFIX = "dblp";
+
+    /** Creates the descriptor with its fixed settings. */
+    public DBLPSmallDataset() {
+        super(NAME, NO_RECORDS, THRESHOLD, RECORD_DATA, SUFFIX, SUFFIX);
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java
new file mode 100644
index 0000000..38aad37
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java

@@ -0,0 +1,56 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+/**
+ * Descriptor for the four-record "intarray-bag-small" dataset. The path is
+ * the same string as the name, and the same suffix is reported for every
+ * relation.
+ */
+public class IntArrayBagSmallDataset extends AbstractDataset {
+    // These values never vary per instance, so they are class-level constants.
+    private static final int NO_RECORDS = 4;
+    private static final String NAME = "intarray-bag-small";
+    private static final String PATH = NAME;
+    private static final float THRESHOLD = .5f;
+
+    public IntArrayBagSmallDataset() {
+    }
+
+    /** @return the dataset name, {@code "intarray-bag-small"} */
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    /** @return the record count declared for this dataset (4) */
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    /** @return the dataset path, identical to the name */
+    @Override
+    public String getPath() {
+        return PATH;
+    }
+
+    /**
+     * @param relation ignored; every relation shares one suffix
+     * @return always {@code "r"}
+     */
+    @Override
+    public String getSuffix(Relation relation) {
+        return "r";
+    }
+
+    /** @return the threshold configured for this dataset (0.5) */
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java
new file mode 100644
index 0000000..7c8c80d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java

@@ -0,0 +1,56 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+/**
+ * Descriptor for the four-record "intarray-set-small" dataset. The path is
+ * the same string as the name, and the same suffix is reported for every
+ * relation.
+ */
+public class IntArraySetSmallDataset extends AbstractDataset {
+    // These values never vary per instance, so they are class-level constants.
+    private static final int NO_RECORDS = 4;
+    private static final String NAME = "intarray-set-small";
+    private static final String PATH = NAME;
+    private static final float THRESHOLD = .5f;
+
+    public IntArraySetSmallDataset() {
+    }
+
+    /** @return the dataset name, {@code "intarray-set-small"} */
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    /** @return the record count declared for this dataset (4) */
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    /** @return the dataset path, identical to the name */
+    @Override
+    public String getPath() {
+        return PATH;
+    }
+
+    /**
+     * @param relation ignored; every relation shares one suffix
+     * @return always {@code "r"}
+     */
+    @Override
+    public String getSuffix(Relation relation) {
+        return "r";
+    }
+
+    /** @return the threshold configured for this dataset (0.5) */
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java
new file mode 100644
index 0000000..e8c2f2a
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java

@@ -0,0 +1,41 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+/**
+ * Descriptor for the "pub" publications dataset. The "dblp" suffix is passed
+ * to the superclass as the R suffix and "csx" as the S suffix.
+ */
+public class PUBDataset extends PublicationsDataset {
+    private static final String NAME = "pub";
+    private static final int NO_RECORDS = 1385532;
+    private static final String DBLP_SUFFIX = "dblp";
+    private static final String CSX_SUFFIX = "csx";
+    private static final float THRESHOLD = .8f;
+    private static final String RECORD_DATA = "2,3";
+
+    /** Creates the descriptor with the default threshold and record data. */
+    public PUBDataset() {
+        this(THRESHOLD);
+    }
+
+    /**
+     * Creates the descriptor with the default record data.
+     *
+     * @param threshold threshold handed to the superclass
+     */
+    public PUBDataset(float threshold) {
+        this(threshold, RECORD_DATA);
+    }
+
+    /**
+     * Creates the descriptor with explicit threshold and record data.
+     *
+     * @param threshold  threshold handed to the superclass
+     * @param recordData record data handed to the superclass
+     */
+    public PUBDataset(float threshold, String recordData) {
+        super(NAME, NO_RECORDS, threshold, recordData, DBLP_SUFFIX, CSX_SUFFIX);
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java
new file mode 100644
index 0000000..eed28e4
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java

@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+/**
+ * Descriptor for the 100-record "pub-small" publications dataset. The "dblp"
+ * suffix is passed to the superclass as the R suffix and "csx" as the S
+ * suffix.
+ */
+public class PUBSmallDataset extends PublicationsDataset {
+    private static final String NAME = "pub-small";
+    private static final int NO_RECORDS = 100;
+    private static final float THRESHOLD = .5f;
+    private static final String RECORD_DATA = "2,3";
+    private static final String DBLP_SUFFIX = "dblp";
+    private static final String CSX_SUFFIX = "csx";
+
+    /** Creates the descriptor with its fixed settings. */
+    public PUBSmallDataset() {
+        super(NAME, NO_RECORDS, THRESHOLD, RECORD_DATA, DBLP_SUFFIX, CSX_SUFFIX);
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java
new file mode 100644
index 0000000..e1653cd
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java

@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.util.NoSuchElementException;
+
+/**
+ * Dataset descriptor whose properties are all supplied through the
+ * constructor. The path is always the same string as the name, and each of
+ * the two relations (R and S) has its own suffix.
+ */
+public class PublicationsDataset extends AbstractTokenizableDataset {
+    protected final String name;
+    protected final String path;
+    protected final int noRecords;
+    protected final float threshold;
+    protected final String recordData;
+    protected final String rSuffix, sSuffix;
+
+    /**
+     * Stores the given settings; {@code path} is set to {@code name}.
+     *
+     * @param name       dataset name, also used as its path
+     * @param noRecords  value returned by {@link #getNoRecords()}
+     * @param threshold  value returned by {@link #getThreshold()}
+     * @param recordData value returned by {@link #getRecordData()}
+     * @param rSuffix    suffix returned for relation R
+     * @param sSuffix    suffix returned for relation S
+     */
+    public PublicationsDataset(String name, int noRecords, float threshold, String recordData, String rSuffix,
+            String sSuffix) {
+        this.name = name;
+        this.noRecords = noRecords;
+        this.threshold = threshold;
+        this.recordData = recordData;
+        this.rSuffix = rSuffix;
+        this.sSuffix = sSuffix;
+
+        path = name;
+    }
+
+    /** @return the dataset name given at construction */
+    @Override
+    public String getName() {
+        return name;
+    }
+
+    /** @return the record count given at construction */
+    @Override
+    public int getNoRecords() {
+        return noRecords;
+    }
+
+    /** @return the dataset path, identical to the name */
+    @Override
+    public String getPath() {
+        return path;
+    }
+
+    /** @return the record data string given at construction */
+    @Override
+    public String getRecordData() {
+        return recordData;
+    }
+
+    /**
+     * Returns the suffix associated with a relation.
+     *
+     * @param relation the relation to look up
+     * @return {@code rSuffix} for R, {@code sSuffix} for S
+     * @throws NoSuchElementException if the relation is neither R nor S
+     */
+    @Override
+    public String getSuffix(Relation relation) {
+        switch (relation) {
+            case R:
+                return rSuffix;
+            case S:
+                return sSuffix;
+            default:
+                // Name the offending value so a failure is diagnosable.
+                throw new NoSuchElementException("No suffix defined for relation: " + relation);
+        }
+    }
+
+    /** @return the threshold given at construction */
+    @Override
+    public float getThreshold() {
+        return threshold;
+    }
+}

diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java
new file mode 100644
index 0000000..6463b2d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java

@@ -0,0 +1,67 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.util.NoSuchElementException;
+
+/**
+ * Descriptor for the four-record "users-visitors-small" dataset. Relation R
+ * uses the "users" suffix and relation S the "visitors" suffix; the path is
+ * the same string as the name.
+ */
+public class UsersVisitorsSmallDataset extends AbstractDataset {
+    // All settings are class-level constants; none varies per instance.
+    private static final int NO_RECORDS = 4;
+    private static final String NAME = "users-visitors-small";
+    private static final String USERS_SUFFIX = "users";
+    private static final String VISITORS_SUFFIX = "visitors";
+    private static final String PATH = NAME;
+    private static final float THRESHOLD = .5f;
+
+    public UsersVisitorsSmallDataset() {
+    }
+
+    /** @return the dataset name, {@code "users-visitors-small"} */
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    /** @return the record count declared for this dataset (4) */
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    /** @return the dataset path, identical to the name */
+    @Override
+    public String getPath() {
+        return PATH;
+    }
+
+    /**
+     * Returns the suffix associated with a relation.
+     *
+     * @param relation the relation to look up
+     * @return {@code "users"} for R, {@code "visitors"} for S
+     * @throws NoSuchElementException if the relation is neither R nor S
+     */
+    @Override
+    public String getSuffix(Relation relation) {
+        switch (relation) {
+            case R:
+                return USERS_SUFFIX;
+            case S:
+                return VISITORS_SUFFIX;
+            default:
+                // Name the offending value so a failure is diagnosable.
+                throw new NoSuchElementException("No suffix defined for relation: " + relation);
+        }
+    }
+
+    /** @return the threshold configured for this dataset (0.5) */
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+
+}

diff --git a/asterix-fuzzyjoin/src/test/scripts/conf.sh b/asterix-fuzzyjoin/src/test/scripts/conf.sh
new file mode 100644
index 0000000..45e962c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/conf.sh

@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+# 
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+### http://www.cse.unsw.edu.au/~weiw/project/simjoin.html
+SSJOIN=/home/rares/workspace/ssjoin-bin
+DATA=../data
+
+IN=ssjoin.in-000
+OUT=ssjoin.out-000

diff --git a/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh b/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh
new file mode 100755
index 0000000..0cd6ccc
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh

@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+# 
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DIR=`dirname $0`; if [ "${DIR:0:1}" == "." ]; then DIR=`pwd`"${DIR:1}"; fi
+source $DIR/conf.sh
+
+ARGS=1                   # Required number of arguments
+E_BADARGS=85             # Wrong number of arguments passed to script.
+if [ $# -lt "$ARGS" ]
+then
+  echo "Usage:   `basename $0` dataset"
+  echo "Example: `basename $0` dblp-small"
+  exit $E_BADARGS
+fi
+
+THR="0.80"
+if [ "$1" == "dblp-small" ]; then
+    THR="0.50"
+fi
+
+
+mkdir $DATA/$1.expected/$OUT
+$SSJOIN/ppjoinplus j $THR $DATA/$1/$IN/part-00000 | \
+    sed 's/0\.812/0\.813/' | \
+    sort > $DATA/$1.expected/$OUT/expected.txt
+
+mkdir $DATA/$1/$OUT
+java \
+    -Xmx2g \
+    -jar $DIR/../../../target/fuzzyjoin-core-0.0.1.jar \
+    $THR $DATA/$1/$IN/part-00000 | \
+    sort > $DATA/$1/$OUT/part-00000
+
+diff $DATA/$1.expected/$OUT/expected.txt $DATA/$1/$OUT/part-00000

diff --git a/asterix-fuzzyjoin/src/test/scripts/inmemory.sh b/asterix-fuzzyjoin/src/test/scripts/inmemory.sh
new file mode 100755
index 0000000..c9394d4
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/inmemory.sh

@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+# 
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+# Input directory and fuzzyjoin jar, given as absolute paths.
+DATA='/home/rares/data/fuzzyjoin/dblp/records-024'
+FUZZYJOIN='/home/rares/fuzzyjoin/fuzzyjoin-core/target/fuzzyjoin-core-0.0.2-SNAPSHOT.jar'
+
+printf '%s\n' '-- - Step 0: Project and append length - --'
+
+# java -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinAppendLength $DATA/part-00000 $DATA/part-00000-len
+
+date
+
+printf '%s\n' '== START =='
+
+printf '%s\n' '-- - Step 1: Sort by length - --'
+
+# time sort -n -k 5 -t ":" $DATA/part-00000-len > $DATA/part-00000-len-sorted
+
+printf '%s\n' '-- - Step 2: Tokenize - --'
+
+# time java -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinTokenize $DATA/part-00000-len-sorted $DATA/part-00000-tokens $DATA/part-00000-tokenized
+
+printf '%s\n' '-- - Step 3: RID pairs - --'
+
+# Only this step is active; steps 0-2 above are commented out, so the
+# tokenized input file must already exist.
+time java -Xmx8g -cp "$FUZZYJOIN" edu.uci.ics.fuzzyjoin.FuzzyJoinMemory .8 "$DATA/part-00000-tokenized" > "$DATA/part-00000-ridpairs"
+
+printf '%s\n' '== END =='
+
+date
+
+
+### SSJoin ###
+# cut -d ":" -f 3,4 records-000/part-0000* > ! ssjoin.raw-000/part-00000
+# ~/workspace/ssjoin-bin/txtformat ssjoin.raw-000/part-00000 ssjoin.norm-000/part-00000 l
+# sed 's/_\+/ /g' ssjoin.norm-000/part-00000 > ! ssjoin.space-000/part-00000
+# ~/workspace/ssjoin-bin/tokenizer ssjoin.space-000/part-00000
+# ~/workspace/ssjoin-bin/ppjoinplus j .8 ssjoin.space-000/part-00000.bin > /dev/null
+# java -jar /fuzzyjoin/fuzzyjoin-core/target/fuzzyjoin-core-0.0.2-SNAPSHOT.jar .8 ssjoin.space-000/part-00000.bin > /dev/null

diff --git a/asterix-fuzzyjoin/src/test/scripts/tokenize.sh b/asterix-fuzzyjoin/src/test/scripts/tokenize.sh
new file mode 100755
index 0000000..5498d44
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/tokenize.sh

@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+# 
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DIR=`dirname $0`; if [ "${DIR:0:1}" == "." ]; then DIR=`pwd`"${DIR:1}"; fi
+source $DIR/conf.sh
+
+ARGS=1                   # Required number of arguments
+E_BADARGS=85             # Wrong number of arguments passed to script.
+if [ $# -lt "$ARGS" ]
+then
+  echo "Usage:   `basename $0` dataset"
+  echo "Example: `basename $0` dblp-small"
+  exit $E_BADARGS
+fi
+
+$SSJOIN/tokenizer $DATA/$1/raw-000/part-00000 $2
+mkdir $DATA/$1/$IN
+mv $DATA/$1/raw-000/part-00000.bin $DATA/$1/$IN/part-00000
+
commit	82838a2cc0c73a40912a5e9fb4b5cd2027ba7d21	[log] [tgz]
author	icetindil <icetindil@gmail.com>	Fri Oct 11 16:41:18 2013 -0700
committer	icetindil <icetindil@gmail.com>	Wed Jan 29 00:32:16 2014 -0800
tree	efa719584a01ff3ac25015f7cd90624b8247f2e8
parent	aa89009750bf17e24d07883f61fe0c0296f4d3a3 [diff]