Merge branch 'genomix/fullstack_genomix' of https://code.google.com/p/hyracks into genomix/fullstack_genomix
diff --git a/genomix/genomix-driver/randomdata.sh b/genomix/genomix-driver/randomdata.sh
new file mode 100755
index 0000000..eaa234f
--- /dev/null
+++ b/genomix/genomix-driver/randomdata.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Runs CMD against NUM_FILES random subsets of a read-id file.
+#
+# Usage: randomdata.sh INFILE NUMLINES NUM_FILES OUTDIR CMD
+#   INFILE     file to sample lines from
+#   NUMLINES   number of lines kept in each random subset
+#   NUM_FILES  number of random subsets to generate and run
+#   OUTDIR     output directory; it is DELETED and recreated on every run
+#   CMD        command string; the subset's path is appended as its last argument
+#
+# The combined stdout/stderr of each run is written to OUTDIR/logs/random_set_<i>.
+# Logs of successful runs are removed, so only failing runs leave a log behind.
+
+#set -o pipefail
+#set -e
+set -x
+
+if [ $# -ne 5 ]; then
+    echo "please provide 5 parameters: infile.readids numlines numfiles outdir and cmd"
+    echo "for example:   $0 /data/users/anbangx/testdata/5k_assemblathon_readids/5k_assemblathon.readids 100 5 ~/subset  \"bin/genomix -kmerLength 55 -localOutput ~/result/500k_reads_P4 -pipelineOrder BUILD_HADOOP,MERGE -localInput \""
+    exit 1
+fi
+
+INFILE=$1
+NUMLINES=$2
+NUM_FILES=$3
+OUTDIR=$4
+CMD=$5
+
+rm -rf "$OUTDIR"
+
+# tee cannot create missing parent directories, so the log dir must exist up front.
+mkdir -p "$OUTDIR/logs" || { echo "couldn't make the log dir $OUTDIR/logs"; exit 1; }
+
+for i in `seq 1 "$NUM_FILES"`;
+do
+    SET_DIR="$OUTDIR/random_set_$i"
+    # Use { ...; } rather than ( ... ): exit inside a subshell would not stop this script.
+    mkdir -p "$SET_DIR" || { echo "couldn't make the output dir $SET_DIR"; exit 1; }
+    sort -R "$INFILE" | head -n "$NUMLINES" > "$SET_DIR/random.readid"
+    # CMD is a full command line, so it is eval'ed; the path stays quoted inside the eval.
+    eval "$CMD \"\$SET_DIR/random.readid\"" 2>&1 | tee "$OUTDIR/logs/random_set_$i"
+    # PIPESTATUS[0] is the exit status of CMD itself, not of tee.
+    if [ ${PIPESTATUS[0]} -eq 0 ]; then
+        rm -rf "$OUTDIR/logs/random_set_$i"
+    fi
+done
+
+
+
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixDriver.java
index ba028a1..564d972 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixDriver.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixDriver.java
@@ -57,7 +57,7 @@
         //InputFormat and OutputFormat for Reducer
         conf.setInputFormat(NLineInputFormat.class);
         conf.setInt("mapred.line.input.format.linespermap", linesPerMap);
-        conf.setInt("io.sort.mb", 150);
+//        conf.setInt("io.sort.mb", 150);
         if (seqOutput == true)
             conf.setOutputFormat(SequenceFileOutputFormat.class);
         else
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/driver/Driver.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/driver/Driver.java
index ecf2a33..7d0655a 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/driver/Driver.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/driver/Driver.java
@@ -161,8 +161,8 @@
     }
 
     public static void main(String[] args) throws Exception {
-        //        String[] myArgs = { "-hdfsInput", "/home/nanz1/TestData", "-hdfsOutput", "/home/hadoop/pairoutput",
-        //                "-kmerLength", "55", "-ip", "128.195.14.113", "-port", "3099", "-frameSize", "252"};
+//                String[] myArgs = { "-hdfsInput", "/home/nanz1/TestData", "-hdfsOutput", "/home/hadoop/pairoutput",
+//                        "-kmerLength", "55", "-ip", "128.195.14.113", "-port", "3099", "-hyracksBuildOutputText", "true"};
         GenomixJobConf jobConf = GenomixJobConf.fromArguments(args);
 
         String ipAddress = jobConf.get(GenomixJobConf.IP_ADDRESS);
@@ -170,7 +170,8 @@
         String IODirs = jobConf.get(GenomixJobConf.HYRACKS_IO_DIRS, null);
         int numOfDuplicate = IODirs != null ? IODirs.split(",").length : 4;
         boolean bProfiling = jobConf.getBoolean(GenomixJobConf.PROFILE, true);
-        if (Boolean.getBoolean(jobConf.get(GenomixJobConf.HYRACKS_BUILD_OUTPUT_TEXT)))
+        
+        if (Boolean.parseBoolean(jobConf.get(GenomixJobConf.HYRACKS_BUILD_OUTPUT_TEXT)))
             jobConf.set(GenomixJobConf.OUTPUT_FORMAT, GenomixJobConf.OUTPUT_FORMAT_TEXT);
         else
             jobConf.set(GenomixJobConf.OUTPUT_FORMAT, GenomixJobConf.OUTPUT_FORMAT_BINARY);
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/util/GraphViz.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/util/GraphViz.java
index 611bd89..b3ab39f 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/util/GraphViz.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/util/GraphViz.java
@@ -76,7 +76,8 @@
     /**

      * Where is your dot program located? It will be called externally.

      */

-    private static String DOT = "/usr/local/bin/dot"; // Linux

+    private static String DOT = "dot";

+//    private static String DOT = "/usr/local/bin/dot"; // Linux

     //   private static String DOT = "c:/Program Files/Graphviz2.26.3/bin/dot.exe";	// Windows

 

     /**

diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/ParameteredTestCaseForSet.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/ParameteredTestCaseForSet.java
index beb41f3..816eabd 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/ParameteredTestCaseForSet.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/ParameteredTestCaseForSet.java
@@ -20,6 +20,8 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -47,7 +49,7 @@
 @SuppressWarnings("deprecation")
 @RunWith(value = Parameterized.class)
 public class ParameteredTestCaseForSet {
-    public static final DirType testSetType = DirType.SINGLEREAD;
+    public static final DirType testSetType = DirType.MULTIPLE_TANDEM_REPEAT;
     public String dataPath;
     public int KmerSize;
     
@@ -64,10 +66,12 @@
         try {
             dirSet = ts.getAllTestInputinDir();
             for (String testDirPointer : dirSet) {
-                String[] paraForSTest = testDirPointer.split("_");
-                if(paraForSTest.length != 2)
-                    throw new Exception("the number of paramters is not enough");
-                data.add(new Object[] { testDirPointer, paraForSTest[1].substring(1)});
+                // The k-mer size is encoded in the directory name as "_k<digits>", e.g. "MultiTandemRepeat_k3".
+                Matcher m = Pattern.compile("_k(\\d+)").matcher(testDirPointer);
+                if (!m.find())
+                    throw new Exception("no \"_k<digits>\" k-mer marker in test directory name: " + testDirPointer);
+                // group(1) is the digit run captured right after "_k".
+                data.add(new Object[] { testDirPointer, m.group(1)});
             }
         } catch (IOException e) {
             e.printStackTrace();
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/TestSet.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/TestSet.java
index bbbedfd..7c275c4 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/TestSet.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/TestSet.java
@@ -16,6 +16,7 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
 
 public class TestSet {
 
@@ -27,6 +28,7 @@
     public final String BRIDGE = "bridge";
     public final String RANDOM = "random";
     public final String SINGLEREAD = "singleread";
+    public final String MULTIPLE_TANDEM_REPEAT = "multi_tandem_repeat";
     
     public final String[] SRSET = { "HighSplitRepeat", "MidSplitRepeat", "LowSplitRepeat" };
     public final String[] TIPSET = { "Tips1", "Tips2", "Tips3", "Tips4" };
@@ -38,7 +40,7 @@
         SPLITREPEAT,
         BRIDGE,
         RANDOM,
-        SINGLEREAD
+        SINGLEREAD, MULTIPLE_TANDEM_REPEAT
     }
 
     private DirType testSet;
@@ -63,8 +65,12 @@
                 break;
             case RANDOM:
                 testSet = patternType.RANDOM;
+                break;
             case SINGLEREAD:
                 testSet = patternType.SINGLEREAD;
+                break;
+            case MULTIPLE_TANDEM_REPEAT:
+                testSet = patternType.MULTIPLE_TANDEM_REPEAT;
         }
     }
 
@@ -108,19 +114,33 @@
                 return detectAllTestSet(PREFIX + RANDOM);
             case SINGLEREAD:
                 return detectAllTestSet(PREFIX + SINGLEREAD);
+            case MULTIPLE_TANDEM_REPEAT:
+                return detectAllTestSet(PREFIX + MULTIPLE_TANDEM_REPEAT);
         }
         return null;
     }
 
+    /**
+     * Lists the paths of all entries directly under {@code inputPrefix},
+     * skipping any entry whose name contains ".DS_Store".
+     *
+     * @param inputPrefix directory to scan
+     * @return the path of every remaining entry; empty if there are none
+     * @throws IOException if {@code inputPrefix} cannot be listed (e.g. it is missing or not a directory)
+     */
     private String[] detectAllTestSet(String inputPrefix) throws IOException {
         File src = new File(inputPrefix);
-        String[] output = new String[src.listFiles().length - 1];
-        int i = 0;
-        for (File f : src.listFiles()) {
+        // listFiles() returns null when the path is not a directory or an I/O error occurs.
+        File[] entries = src.listFiles();
+        if (entries == null) {
+            throw new IOException("cannot list test input directory: " + inputPrefix);
+        }
+        ArrayList<String> outArray = new ArrayList<String>(entries.length);
+        for (File f : entries) {
             if (!f.getName().contains(".DS_Store")){
-                output[i++] = f.getPath().toString();
+                outArray.add(f.getPath().toString());
             }
         }
-        return output;
+        return outArray.toArray(new String[outArray.size()]);
     }
 }
diff --git a/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat2_k3/repeat.txt b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat2_k3/repeat.txt
new file mode 100644
index 0000000..00b443b
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat2_k3/repeat.txt
@@ -0,0 +1 @@
+1	ATCGCGCAT
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat3_k3/repeat.txt b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat3_k3/repeat.txt
new file mode 100644
index 0000000..5a8db3a
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat3_k3/repeat.txt
@@ -0,0 +1 @@
+1	CGCGCGCATCGCGCCGCGCGCGGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCCG
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat_k3/repeat.txt b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat_k3/repeat.txt
new file mode 100644
index 0000000..2b9ee5b
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat_k3/repeat.txt
@@ -0,0 +1 @@
+1	CGCGC
\ No newline at end of file