adding fuzzyjoin code to git
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java
new file mode 100644
index 0000000..f5a8cec9
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTest.java
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.util.ArrayList;
+
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.FuzzyJoinMemory;
+import edu.uci.ics.asterix.fuzzyjoin.ResultSelfJoin;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.AbstractDataset;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.AbstractDataset.Directory;
+import edu.uci.ics.asterix.fuzzyjoin.tests.dataset.DBLPSmallDataset;
+
+public class FuzzyJoinTest {
+
+    private static final AbstractDataset dataset = new DBLPSmallDataset();
+    private static final String base = "data/";
+
+    /** Self-joins the SSJOININ records in memory, writes the result pairs and verifies them. */
+    @Test
+    public void test() throws Exception {
+        ArrayList<int[]> records = new ArrayList<int[]>();
+        ArrayList<Integer> rids = new ArrayList<Integer>();
+        ArrayList<ResultSelfJoin> results = new ArrayList<ResultSelfJoin>();
+
+        dataset.createDirecotries(new String[] { base });
+        FuzzyJoinMemory fj = new FuzzyJoinMemory(dataset.getThreshold());
+        FuzzyJoinMemory.readRecords(base + dataset.getPathPart0(Directory.SSJOININ), records, rids);
+        for (int[] record : records) {
+            results.addAll(fj.selfJoinAndAddRecord(record));
+        }
+
+        String outPath = base + dataset.getPathPart0(Directory.SSJOINOUT);
+        BufferedWriter out = new BufferedWriter(new FileWriter(outPath));
+        try {
+            for (ResultSelfJoin result : results) {
+                out.write(String.format(java.util.Locale.ROOT, "%d %d %.3f\n", rids.get(result.indexX),
+                        rids.get(result.indexY), result.similarity)); // Locale.ROOT: '.' separator in any locale
+            }
+        } finally {
+            out.close(); // runs even when a write throws, so the file handle is not leaked
+        }
+        FuzzyJoinTestUtil.verifyDirectory(outPath, base + dataset.getPathExpected(Directory.SSJOINOUT));
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java
new file mode 100644
index 0000000..db44850
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/FuzzyJoinTestUtil.java
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+
+import org.junit.Assert;
+
+public class FuzzyJoinTestUtil {
+
+    /** Same check as {@link #verifyDirectory(String, String, boolean)}, passing {@code false} for noDup. */
+    public static void verifyDirectory(String pathTest, String pathCorrect) throws IOException {
+        verifyDirectory(pathTest, pathCorrect, false);
+    }
+
+    /**
+     * Asserts that every line of {@code pathCorrect} occurs in {@code pathTest} and that the line count
+     * of {@code pathCorrect} equals the distinct line count of {@code pathTest}; {@code noDup} is never read.
+     */
+    public static void verifyDirectory(String pathTest, String pathCorrect, boolean noDup) throws IOException {
+        HashSet<String> buffer = new HashSet<String>();
+        BufferedReader input = new BufferedReader(new FileReader(pathTest)); // buffer Test
+        try {
+            for (String line = input.readLine(); line != null; line = input.readLine()) {
+                buffer.add(line);
+            }
+        } finally {
+            input.close(); // release the file handle even if reading fails
+        }
+        int countCorrect = 0;
+        input = new BufferedReader(new FileReader(new File(pathCorrect))); // probe Correct
+        try {
+            for (String line = input.readLine(); line != null; line = input.readLine()) {
+                Assert.assertTrue("line missing from " + pathTest + ": " + line, buffer.contains(line));
+                countCorrect++;
+            }
+        } finally {
+            input.close();
+        }
+        Assert.assertEquals(countCorrect, buffer.size()); // check counts: expected first, then actual
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
new file mode 100644
index 0000000..e65bb25
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
@@ -0,0 +1,239 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.HashedUTF8NGramTokenFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.NGramUTF8StringBinaryTokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.UTF8NGramTokenFactory;
+
+public class NGramTokenizerTest {
+
+ private char PRECHAR = '#';
+ private char POSTCHAR = '$';
+
+ private String str = "Jürgen S. Generic's Car";
+ private byte[] inputBuffer;
+
+ private int gramLength = 3;
+
+ private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
+
+ String tmp = s.toLowerCase();
+ if (prePost) {
+ StringBuilder preBuilder = new StringBuilder();
+ for (int i = 0; i < gramLength - 1; i++) {
+ preBuilder.append(PRECHAR);
+ }
+ String pre = preBuilder.toString();
+
+ StringBuilder postBuilder = new StringBuilder();
+ for (int i = 0; i < gramLength - 1; i++) {
+ postBuilder.append(POSTCHAR);
+ }
+ String post = postBuilder.toString();
+
+ tmp = pre + s.toLowerCase() + post;
+ }
+
+ for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
+ String gram = tmp.substring(i, i + gramLength);
+ grams.add(gram);
+ }
+ }
+
+ @Before
+ public void init() throws Exception {
+ // serialize string into bytes
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput dos = new DataOutputStream(baos);
+ dos.writeUTF(str);
+ inputBuffer = baos.toByteArray();
+ }
+
+ void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
+ HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
+ false, tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+ HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
+ for (String s : expectedGrams) {
+ Integer count = gramCounts.get(s);
+ if (count == null) {
+ count = 1;
+ gramCounts.put(s, count);
+ } else {
+ count++;
+ }
+
+ int hash = tokenHash(s, count);
+ expectedHashedGrams.add(hash);
+ }
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize hashed token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ Integer hashedGram = in.readInt();
+
+ // System.out.println(hashedGram);
+
+ Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
+
+ void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
+ HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+ tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+ for (String s : expectedGrams) {
+ int hash = tokenHash(s, 1);
+ expectedHashedGrams.add(hash);
+ }
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize hashed token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ Integer hashedGram = in.readInt();
+
+ // System.out.println(hashedGram);
+
+ Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
+
+ void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
+ UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+ tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+
+ int tokenCount = 0;
+
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+
+ // serialize hashed token
+ ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+ DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenDos);
+
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ DataInput in = new DataInputStream(bais);
+
+ String strGram = in.readUTF();
+
+ // System.out.println("\"" + strGram + "\"");
+
+ Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
+
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
+
+ @Test
+ public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
+ runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
+ runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
+ }
+
+ @Test
+ public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
+ runTestNGramTokenizerWithHashedUTF8Tokens(false);
+ runTestNGramTokenizerWithHashedUTF8Tokens(true);
+ }
+
+ @Test
+ public void testNGramTokenizerWithUTF8Tokens() throws IOException {
+ runTestNGramTokenizerWithUTF8Tokens(false);
+ runTestNGramTokenizerWithUTF8Tokens(true);
+ }
+
+ public int tokenHash(String token, int tokenCount) {
+ int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+ for (int i = 0; i < token.length(); i++) {
+ h ^= token.charAt(i);
+ h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+ }
+ return h + tokenCount;
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java
new file mode 100644
index 0000000..8fd05da
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/WordTokenizerTest.java
@@ -0,0 +1,214 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.Assert;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.HashedUTF8WordTokenFactory;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.asterix.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
+
+public class WordTokenizerTest {
+
+    private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
+    private byte[] inputBuffer;
+
+    private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
+    private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
+    private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
+
+    /**
+     * Serializes {@code text} into {@code inputBuffer} and fills the expected-token lists: the
+     * lower-cased words, their hashes with count 1, and their hashes with each word's running count.
+     */
+    @Before
+    public void init() throws IOException {
+        // serialize text into bytes
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutput dos = new DataOutputStream(baos);
+        dos.writeUTF(text);
+        inputBuffer = baos.toByteArray();
+
+        // init expected string tokens
+        expectedUTF8Tokens.add("hello");
+        expectedUTF8Tokens.add("world");
+        expectedUTF8Tokens.add("i");
+        expectedUTF8Tokens.add("would");
+        expectedUTF8Tokens.add("like");
+        expectedUTF8Tokens.add("to");
+        expectedUTF8Tokens.add("inform");
+        expectedUTF8Tokens.add("you");
+        expectedUTF8Tokens.add("of");
+        expectedUTF8Tokens.add("the");
+        expectedUTF8Tokens.add("importance");
+        expectedUTF8Tokens.add("of");
+        expectedUTF8Tokens.add("foo");
+        expectedUTF8Tokens.add("bar");
+        expectedUTF8Tokens.add("yes");
+        expectedUTF8Tokens.add("foo");
+        expectedUTF8Tokens.add("bar");
+        expectedUTF8Tokens.add("jürgen");
+
+        // hashed tokens ignoring token count
+        for (String word : expectedUTF8Tokens) {
+            expectedHashedUTF8Tokens.add(tokenHash(word, 1));
+        }
+
+        // hashed tokens using token count
+        HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
+        for (String word : expectedUTF8Tokens) {
+            // Store the running count back on every occurrence; storing only the first occurrence
+            // would hash a third occurrence with count 2 instead of 3.
+            Integer seen = tokenCounts.get(word);
+            int count = (seen == null) ? 1 : seen + 1;
+            tokenCounts.put(word, count);
+            expectedCountedHashedUTF8Tokens.add(tokenHash(word, count));
+        }
+    }
+
+    /**
+     * Tokenizes {@code inputBuffer} with the tokenizer's first constructor argument {@code false} and
+     * compares each serialized int token with {@code expectedCountedHashedUTF8Tokens}.
+     */
+    @Test
+    public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
+        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false,
+                tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+            Integer hashedToken = in.readInt();
+
+            Assert.assertEquals(expectedCountedHashedUTF8Tokens.get(tokenCount), hashedToken);
+            tokenCount++;
+        }
+    }
+
+    /**
+     * Tokenizes {@code inputBuffer} with the tokenizer's first constructor argument {@code true} and
+     * compares each serialized int token with {@code expectedHashedUTF8Tokens}.
+     */
+    @Test
+    public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
+        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false,
+                tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+            Integer hashedToken = in.readInt();
+
+            Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken);
+            tokenCount++;
+        }
+    }
+
+    /**
+     * Tokenizes {@code inputBuffer} using a {@code UTF8WordTokenFactory} and compares each token, read
+     * back with {@code readUTF}, with {@code expectedUTF8Tokens}.
+     */
+    @Test
+    public void testWordTokenizerWithUTF8Tokens() throws IOException {
+        UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false,
+                tokenFactory);
+
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+        int tokenCount = 0;
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
+
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
+            String strToken = in.readUTF();
+
+            Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
+            tokenCount++;
+        }
+    }
+
+    // JAQL
+    /**
+     * Expected hash of {@code token}: starting from GOLDEN_RATIO_32, each char is XOR-ed in and the
+     * value multiplied by GOLDEN_RATIO_32; {@code tokenCount} is added at the end (int arithmetic
+     * wraps on overflow).
+     */
+    public int tokenHash(String token, int tokenCount) {
+        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+        for (int i = 0; i < token.length(); i++) {
+            h ^= token.charAt(i);
+            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+        }
+        return h + tokenCount;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java
new file mode 100644
index 0000000..5ca6c6d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractDataset.java
@@ -0,0 +1,158 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.io.File;
+import java.util.NoSuchElementException;
+
+public abstract class AbstractDataset {
+ public static enum Directory {
+ RAW_R,
+ RAW_S,
+ RECORDPAIRS,
+ RECORDS_R,
+ RECORDS_S,
+ RECORDSBULK_R,
+ RECORDSBULK_S,
+ RIDPAIRS,
+ SSJOININ,
+ SSJOINOUT,
+ TOKENS,
+ TOKENS_R,
+ TOKENS_R_AQL,
+ }
+
+ public static enum Relation {
+ R, S,
+ }
+
+ public static final String FILE_PART = "part-";
+ public static final String FILE_PART0 = FILE_PART + "00000";
+ public static final String FILE_EXPECTED = "expected.txt";
+ public static final String AQL = "aql";
+
+ public static final String PATH_RAW = "raw";
+ public static final String PATH_RECORDPAIRS = "recordpairs";
+ public static final String PATH_RECORDS = "records";
+ public static final String PATH_RECORDSBULK = "recordsbulk";
+ public static final String PATH_RIDPAIRS = "ridpairs";
+ public static final String PATH_SSJOININ = "ssjoin.in";
+ public static final String PATH_SSJOINOUT = "ssjoin.out";
+ public static final String PATH_TOKENS = "tokens";
+
+ public static final String DIRECTORY_ID_FORMAT = "%03d";
+
+ public void createDirecotries(String[] paths) {
+ createDirecotries(paths, 0);
+ }
+
+ public void createDirecotries(String[] paths, int crtCopy) {
+ (new File(paths[0] + getPathDirecotry(Directory.SSJOINOUT, 0))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.RECORDSBULK_R, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.RECORDSBULK_S, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.RECORDS_R, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.RECORDS_S, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.TOKENS, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.TOKENS_R, crtCopy))).mkdir();
+ (new File(paths[0] + getPathDirecotry(Directory.TOKENS_R_AQL, crtCopy))).mkdir();
+ }
+
+ public abstract String getName();
+
+ public abstract int getNoRecords();
+
+ public abstract String getPath();
+
+ public String getPathDirecotry(Directory directory, int crtCopy) {
+ return getPathDirectory(getPath(), directory, crtCopy);
+ }
+
+ private String getPathDirectory(Directory directory, int crtCopy, boolean expected) {
+ return getPathDirectory(getName() + (expected ? ".expected" : ""), directory, crtCopy);
+ }
+
+ public String getPathDirectory(String path, Directory directory, int crtCopy) {
+ path += '/';
+ switch (directory) {
+ case SSJOININ:
+ path += AbstractDataset.PATH_SSJOININ;
+ break;
+ case SSJOINOUT:
+ path += AbstractDataset.PATH_SSJOINOUT;
+ break;
+ case RAW_R:
+ path += AbstractDataset.PATH_RAW + "." + getSuffix(Relation.R);
+ break;
+ case RAW_S:
+ path += AbstractDataset.PATH_RAW + "." + getSuffix(Relation.S);
+ break;
+ case RECORDSBULK_R:
+ path += AbstractDataset.PATH_RECORDSBULK + "." + getSuffix(Relation.R);
+ break;
+ case RECORDSBULK_S:
+ path += AbstractDataset.PATH_RECORDSBULK + "." + getSuffix(Relation.S);
+ break;
+ case RECORDS_R:
+ path += AbstractDataset.PATH_RECORDS + "." + getSuffix(Relation.R);
+ break;
+ case RECORDS_S:
+ path += AbstractDataset.PATH_RECORDS + "." + getSuffix(Relation.S);
+ break;
+ case TOKENS:
+ path += AbstractDataset.PATH_TOKENS;
+ break;
+ case TOKENS_R:
+ path += AbstractDataset.PATH_TOKENS + "." + getSuffix(Relation.R);
+ break;
+ case TOKENS_R_AQL:
+ path += AbstractDataset.PATH_TOKENS + "." + getSuffix(Relation.R) + "." + AQL;
+ break;
+ case RIDPAIRS:
+ path += AbstractDataset.PATH_RIDPAIRS;
+ break;
+ case RECORDPAIRS:
+ path += AbstractDataset.PATH_RECORDPAIRS;
+ break;
+ default:
+ throw new NoSuchElementException();
+ }
+ return path + "-" + String.format(DIRECTORY_ID_FORMAT, crtCopy);
+ }
+
+ public String getPathExpected(Directory directory) {
+ return getPathDirectory(directory, 0, true) + '/' + FILE_EXPECTED;
+ }
+
+ public String getPathPart(Directory directory, int crtCopy) {
+ return getPathDirecotry(directory, crtCopy) + '/' + FILE_PART;
+ }
+
+ public String getPathPart0(Directory directory) {
+ return getPathDirectory(directory, 0, false) + '/' + FILE_PART0;
+ }
+
+ public String getPathPart0(Directory directory, boolean expected) {
+ return getPathDirectory(directory, 0, expected) + '/' + (expected ? FILE_EXPECTED : FILE_PART0);
+ }
+
+ public abstract String getSuffix(Relation relation);
+
+ public abstract float getThreshold();
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java
new file mode 100644
index 0000000..5333cad
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/AbstractTokenizableDataset.java
@@ -0,0 +1,5 @@
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public abstract class AbstractTokenizableDataset extends AbstractDataset { // an AbstractDataset that also exposes a record-data string
+ public abstract String getRecordData(); // NOTE(review): meaning not visible here; presumably a value like the "2,3" that DBLPDataset passes to its superclass — confirm
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java
new file mode 100644
index 0000000..3783829
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPDataset.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+
+public class DBLPDataset extends PublicationsDataset {
+    private static final String NAME = "dblp";
+    private static final int NO_RECORDS = 1268017;
+    private static final float THRESHOLD = .8f;
+    private static final String RECORD_DATA = "2,3";
+    /** Builds the dataset with the default record data, {@code "2,3"}. */
+    public DBLPDataset() {
+        this(RECORD_DATA);
+    }
+    /** Builds the dataset with {@code recordData}; NAME also fills the last two super arguments. */
+    public DBLPDataset(String recordData) {
+        super(NAME, NO_RECORDS, THRESHOLD, recordData, NAME, NAME);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java
new file mode 100644
index 0000000..5eaebd2
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/DBLPSmallDataset.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class DBLPSmallDataset extends PublicationsDataset { // PublicationsDataset configured as "dblp-small"
+ public DBLPSmallDataset() {
+ super("dblp-small", 100, .5f, "2,3", "dblp", "dblp"); // NOTE(review): presumably (name, record count, threshold, record data, R suffix, S suffix) — confirm against PublicationsDataset
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java
new file mode 100644
index 0000000..38aad37
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArrayBagSmallDataset.java
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class IntArrayBagSmallDataset extends AbstractDataset {
+    private static final int NO_RECORDS = 4;
+    private static final String NAME = "intarray-bag-small";
+    private static final float THRESHOLD = .5f;
+
+    /** Returns the dataset name, {@code "intarray-bag-small"}. */
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    /** Returns the record count, 4. */
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    /** Returns the dataset path, which is the same string as the name. */
+    @Override
+    public String getPath() {
+        return NAME;
+    }
+
+    /** Returns {@code "r"} for every relation; the argument is not consulted. */
+    @Override
+    public String getSuffix(Relation relation) {
+        return "r";
+    }
+
+    /** Returns the threshold, 0.5. */
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java
new file mode 100644
index 0000000..7c8c80d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/IntArraySetSmallDataset.java
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class IntArraySetSmallDataset extends AbstractDataset {
+    private static final int NO_RECORDS = 4;
+    private static final String NAME = "intarray-set-small";
+    private static final float THRESHOLD = .5f;
+
+    /** Returns the dataset name, {@code "intarray-set-small"}. */
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    /** Returns the record count, 4. */
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    /** Returns the dataset path, which is the same string as the name. */
+    @Override
+    public String getPath() {
+        return NAME;
+    }
+
+    /** Returns {@code "r"} for every relation; the argument is not consulted. */
+    @Override
+    public String getSuffix(Relation relation) {
+        return "r";
+    }
+
+    /** Returns the threshold, 0.5. */
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java
new file mode 100644
index 0000000..e8c2f2a
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBDataset.java
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+/** The "pub" publications dataset, whose suffix is "dblp" for relation R and "csx" for relation S. */
+public class PUBDataset extends PublicationsDataset {
+    private static final String NAME = "pub";
+    private static final int NO_RECORDS = 1385532;
+    private static final float THRESHOLD = .8f; // used when the caller passes no threshold
+    private static final String RECORD_DATA = "2,3"; // used when the caller passes no record data
+    private static final String DBLP_SUFFIX = "dblp", CSX_SUFFIX = "csx"; // R-side and S-side suffixes
+
+    public PUBDataset() { // default threshold, default record data
+        this(THRESHOLD);
+    }
+
+    public PUBDataset(float threshold) { // default record data
+        this(threshold, RECORD_DATA);
+    }
+
+    public PUBDataset(float threshold, String recordData) { // every other value is a class constant
+        super(NAME, NO_RECORDS, threshold, recordData, DBLP_SUFFIX, CSX_SUFFIX);
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java
new file mode 100644
index 0000000..eed28e4
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PUBSmallDataset.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+public class PUBSmallDataset extends PublicationsDataset { // PublicationsDataset with fixed "pub-small" settings
+ public PUBSmallDataset() {
+ super("pub-small", 100, .5f, "2,3", "dblp", "csx"); // name, noRecords, threshold, recordData, rSuffix, sSuffix
+ }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java
new file mode 100644
index 0000000..e1653cd
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/PublicationsDataset.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.util.NoSuchElementException;
+
+/** Dataset descriptor whose name, record count, threshold, record data and R/S suffixes are set at construction. */
+public class PublicationsDataset extends AbstractTokenizableDataset {
+    protected final String name;
+    protected final String path;
+    protected final int noRecords;
+    protected final float threshold;
+    protected final String recordData;
+    protected final String rSuffix, sSuffix;
+
+    /** Keeps each argument unchanged; {@code path} has no parameter of its own and is set to {@code name}. */
+    public PublicationsDataset(String name, int noRecords, float threshold, String recordData, String rSuffix,
+            String sSuffix) {
+        this.name = name;
+        this.path = name;
+        this.noRecords = noRecords;
+        this.threshold = threshold;
+        this.recordData = recordData;
+        this.rSuffix = rSuffix;
+        this.sSuffix = sSuffix;
+    }
+
+    @Override
+    public String getName() {
+        return name;
+    }
+
+    @Override
+    public int getNoRecords() {
+        return noRecords;
+    }
+
+    @Override
+    public String getPath() {
+        return path;
+    }
+
+    @Override
+    public String getRecordData() {
+        return recordData;
+    }
+
+    /** Returns {@code rSuffix} for relation R and {@code sSuffix} for relation S;
+     *  any other relation value raises {@link NoSuchElementException}. */
+    @Override
+    public String getSuffix(Relation relation) {
+        switch (relation) {
+            case R: return rSuffix;
+            case S: return sSuffix;
+            default: throw new NoSuchElementException();
+        }
+    }
+
+    @Override
+    public float getThreshold() {
+        return threshold;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java
new file mode 100644
index 0000000..6463b2d
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/java/edu/uci/ics/asterix/fuzzyjoin/tests/dataset/UsersVisitorsSmallDataset.java
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.asterix.fuzzyjoin.tests.dataset;
+
+import java.util.NoSuchElementException;
+
+/** Describes the "users-visitors-small" test dataset: it reports 4 records, threshold 0.5,
+ *  and the suffix "users" for relation R and "visitors" for relation S. */
+public class UsersVisitorsSmallDataset extends AbstractDataset {
+    private static final int NO_RECORDS = 4;
+    private static final String NAME = "users-visitors-small";
+    private static final String USERS_SUFFIX = "users";
+    private static final String VISITORS_SUFFIX = "visitors";
+    private static final String PATH = NAME; // the path is the same string as the name
+    private static final float THRESHOLD = .5f;
+
+    public UsersVisitorsSmallDataset() {
+    }
+
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    @Override
+    public int getNoRecords() {
+        return NO_RECORDS;
+    }
+
+    @Override
+    public String getPath() {
+        return PATH;
+    }
+
+    /** Returns "users" for relation R and "visitors" for relation S;
+     *  any other relation value raises {@link NoSuchElementException}. */
+    @Override
+    public String getSuffix(Relation relation) {
+        switch (relation) {
+            case R: return USERS_SUFFIX;
+            case S: return VISITORS_SUFFIX;
+            default: throw new NoSuchElementException();
+        }
+    }
+
+    @Override
+    public float getThreshold() {
+        return THRESHOLD;
+    }
+}
diff --git a/asterix-fuzzyjoin/src/test/scripts/conf.sh b/asterix-fuzzyjoin/src/test/scripts/conf.sh
new file mode 100644
index 0000000..45e962c
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/conf.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+### http://www.cse.unsw.edu.au/~weiw/project/simjoin.html
+SSJOIN="/home/rares/workspace/ssjoin-bin"  # directory of the ssjoin executables (tokenizer, ppjoinplus)
+DATA="../data"                             # dataset root; a relative path, so it depends on the working directory
+
+IN="ssjoin.in-000"    # per-dataset sub-directory that holds the join input
+OUT="ssjoin.out-000"  # per-dataset sub-directory that receives the join output
diff --git a/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh b/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh
new file mode 100755
index 0000000..0cd6ccc
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/fuzzyjoin.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+# Cross-checks the Java fuzzy join against ppjoinplus on dataset $1 and prints the diff of the two sorted outputs.
+DIR="$(cd "$(dirname "$0")" && pwd)"  # absolute script directory, however the script was invoked
+source "$DIR/conf.sh"                 # provides SSJOIN, DATA, IN and OUT
+
+ARGS=1         # Required number of arguments
+E_BADARGS=85   # Wrong number of arguments passed to script.
+if [ $# -lt "$ARGS" ]; then
+    echo "Usage: $(basename "$0") dataset"
+    echo "Example: $(basename "$0") dblp-small"
+    exit $E_BADARGS
+fi
+
+THR="0.80"     # similarity threshold; only dblp-small uses 0.50
+if [ "$1" == "dblp-small" ]; then
+    THR="0.50"
+fi
+
+# Reference output from ppjoinplus. NOTE(review): the 0.812 -> 0.813 rewrite presumably hides a rounding difference between the two tools -- confirm.
+mkdir -p "$DATA/$1.expected/$OUT"
+"$SSJOIN/ppjoinplus" j "$THR" "$DATA/$1/$IN/part-00000" | \
+    sed 's/0\.812/0\.813/' | \
+    sort > "$DATA/$1.expected/$OUT/expected.txt"
+
+mkdir -p "$DATA/$1/$OUT"
+java \
+    -Xmx2g \
+    -jar "$DIR/../../../target/fuzzyjoin-core-0.0.1.jar" \
+    "$THR" "$DATA/$1/$IN/part-00000" | \
+    sort > "$DATA/$1/$OUT/part-00000"
+
+diff "$DATA/$1.expected/$OUT/expected.txt" "$DATA/$1/$OUT/part-00000"
diff --git a/asterix-fuzzyjoin/src/test/scripts/inmemory.sh b/asterix-fuzzyjoin/src/test/scripts/inmemory.sh
new file mode 100755
index 0000000..c9394d4
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/inmemory.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+# Times the in-memory fuzzy join on one DBLP file. Only step 3 is live; the commands of steps 0-2 are commented out.
+DATA="/home/rares/data/fuzzyjoin/dblp/records-024"  # directory holding the part-00000* files
+FUZZYJOIN="/home/rares/fuzzyjoin/fuzzyjoin-core/target/fuzzyjoin-core-0.0.2-SNAPSHOT.jar"  # jar put on the classpath
+
+echo "-- - Step 0: Project and append length - --"
+
+# java -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinAppendLength $DATA/part-00000 $DATA/part-00000-len
+
+date
+
+echo "== START =="
+
+echo "-- - Step 1: Sort by length - --"
+
+# time sort -n -k 5 -t ":" $DATA/part-00000-len > $DATA/part-00000-len-sorted
+
+echo "-- - Step 2: Tokenize - --"
+
+# time java -cp $FUZZYJOIN edu.uci.ics.fuzzyjoin.FuzzyJoinTokenize $DATA/part-00000-len-sorted $DATA/part-00000-tokens $DATA/part-00000-tokenized
+
+echo "-- - Step 3: RID pairs - --"
+
+time java -Xmx8g -cp "$FUZZYJOIN" edu.uci.ics.fuzzyjoin.FuzzyJoinMemory .8 "$DATA/part-00000-tokenized" > "$DATA/part-00000-ridpairs"
+
+echo "== END =="
+
+date
+
+### SSJoin ###
+# cut -d ":" -f 3,4 records-000/part-0000* > ! ssjoin.raw-000/part-00000
+# ~/workspace/ssjoin-bin/txtformat ssjoin.raw-000/part-00000 ssjoin.norm-000/part-00000 l
+# sed 's/_\+/ /g' ssjoin.norm-000/part-00000 > ! ssjoin.space-000/part-00000
+# ~/workspace/ssjoin-bin/tokenizer ssjoin.space-000/part-00000
+# ~/workspace/ssjoin-bin/ppjoinplus j .8 ssjoin.space-000/part-00000.bin > /dev/null
+# java -jar /fuzzyjoin/fuzzyjoin-core/target/fuzzyjoin-core-0.0.2-SNAPSHOT.jar .8 ssjoin.space-000/part-00000.bin > /dev/null
diff --git a/asterix-fuzzyjoin/src/test/scripts/tokenize.sh b/asterix-fuzzyjoin/src/test/scripts/tokenize.sh
new file mode 100755
index 0000000..5498d44
--- /dev/null
+++ b/asterix-fuzzyjoin/src/test/scripts/tokenize.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Copyright 2010-2011 The Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License. You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# Author: Rares Vernica <rares (at) ics.uci.edu>
+
+DIR=`dirname $0`; if [ "${DIR:0:1}" == "." ]; then DIR=`pwd`"${DIR:1}"; fi
+source $DIR/conf.sh
+
+ARGS=1 # Required number of arguments
+E_BADARGS=85 # Wrong number of arguments passed to script.
+if [ $# -lt "$ARGS" ]
+then
+ echo "Usage: `basename $0` dataset"
+ echo "Example: `basename $0` dblp-small"
+ exit $E_BADARGS
+fi
+
+$SSJOIN/tokenizer $DATA/$1/raw-000/part-00000 $2
+mkdir $DATA/$1/$IN
+mv $DATA/$1/raw-000/part-00000.bin $DATA/$1/$IN/part-00000
+