Merge branch 'genomix/fullstack_genomix' of https://code.google.com/p/hyracks into genomix/fullstack_genomix
diff --git a/genomix/genomix-driver/randomdata.sh b/genomix/genomix-driver/randomdata.sh
new file mode 100755
index 0000000..eaa234f
--- /dev/null
+++ b/genomix/genomix-driver/randomdata.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Runs CMD on NUM_FILES random subsets of INFILE, NUMLINES lines each.
+#
+# Usage: randomdata.sh infile.readids numlines numfiles outdir cmd
+#
+# For each i in 1..NUM_FILES the subset is written to
+# OUTDIR/random_set_i/random.readid and its path is appended to CMD as the last
+# argument. Combined stdout/stderr of the run is copied to
+# OUTDIR/logs/random_set_i; that log is deleted when CMD exits with status 0,
+# so only the logs of failed runs remain.
+#
+# WARNING: OUTDIR is removed recursively before anything is written.
+
+#set -o pipefail
+#set -e
+set -x
+
+if [ $# -ne 5 ]; then
+    echo "please provide 5 parameters: infile.readids numlines numfiles outdir and cmd"
+    echo "for example: $0 /data/users/anbangx/testdata/5k_assemblathon_readids/5k_assemblathon.readids 100 5 ~/subset \"bin/genomix -kmerLength 55 -localOutput ~/result/500k_reads_P4 -pipelineOrder BUILD_HADOOP,MERGE -localInput \""
+    exit 1
+fi
+
+INFILE=$1
+NUMLINES=$2
+NUM_FILES=$3
+OUTDIR=$4
+CMD=$5
+
+rm -rf "$OUTDIR"
+
+# tee cannot create missing directories, so the log dir must exist up front.
+LOGDIR="$OUTDIR/logs"
+mkdir -p "$LOGDIR" || { echo "couldn't make the log dir $LOGDIR"; exit 1; }
+
+for i in $(seq 1 "$NUM_FILES"); do
+    SETDIR="$OUTDIR/random_set_$i"
+    READIDS="$SETDIR/random.readid"
+    LOGFILE="$LOGDIR/random_set_$i"
+
+    # Braces, not parentheses: exit inside ( ... ) would only leave the subshell.
+    mkdir -p "$SETDIR" || { echo "couldn't make the output dir $SETDIR"; exit 1; }
+    sort -R "$INFILE" | head -n "$NUMLINES" > "$READIDS"
+
+    # CMD is a caller-supplied command line, hence eval. The escaped \$READIDS
+    # is expanded inside eval so a path containing spaces stays one argument.
+    eval "$CMD \"\$READIDS\"" 2>&1 | tee "$LOGFILE"
+    # PIPESTATUS[0] is the exit status of the eval'd CMD, not of tee.
+    if [ "${PIPESTATUS[0]}" -eq 0 ]; then
+        rm -rf "$LOGFILE"
+    fi
+done
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixDriver.java
index ba028a1..564d972 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixDriver.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixDriver.java
@@ -57,7 +57,7 @@
//InputFormat and OutputFormat for Reducer
conf.setInputFormat(NLineInputFormat.class);
conf.setInt("mapred.line.input.format.linespermap", linesPerMap);
- conf.setInt("io.sort.mb", 150);
+// conf.setInt("io.sort.mb", 150);
if (seqOutput == true)
conf.setOutputFormat(SequenceFileOutputFormat.class);
else
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/driver/Driver.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/driver/Driver.java
index ecf2a33..7d0655a 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/driver/Driver.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/driver/Driver.java
@@ -161,8 +161,8 @@
}
public static void main(String[] args) throws Exception {
- // String[] myArgs = { "-hdfsInput", "/home/nanz1/TestData", "-hdfsOutput", "/home/hadoop/pairoutput",
- // "-kmerLength", "55", "-ip", "128.195.14.113", "-port", "3099", "-frameSize", "252"};
+// String[] myArgs = { "-hdfsInput", "/home/nanz1/TestData", "-hdfsOutput", "/home/hadoop/pairoutput",
+// "-kmerLength", "55", "-ip", "128.195.14.113", "-port", "3099", "-hyracksBuildOutputText", "true"};
GenomixJobConf jobConf = GenomixJobConf.fromArguments(args);
String ipAddress = jobConf.get(GenomixJobConf.IP_ADDRESS);
@@ -170,7 +170,8 @@
String IODirs = jobConf.get(GenomixJobConf.HYRACKS_IO_DIRS, null);
int numOfDuplicate = IODirs != null ? IODirs.split(",").length : 4;
boolean bProfiling = jobConf.getBoolean(GenomixJobConf.PROFILE, true);
- if (Boolean.getBoolean(jobConf.get(GenomixJobConf.HYRACKS_BUILD_OUTPUT_TEXT)))
+
+ if (Boolean.parseBoolean(jobConf.get(GenomixJobConf.HYRACKS_BUILD_OUTPUT_TEXT)))
jobConf.set(GenomixJobConf.OUTPUT_FORMAT, GenomixJobConf.OUTPUT_FORMAT_TEXT);
else
jobConf.set(GenomixJobConf.OUTPUT_FORMAT, GenomixJobConf.OUTPUT_FORMAT_BINARY);
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/util/GraphViz.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/util/GraphViz.java
index 611bd89..b3ab39f 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/util/GraphViz.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/graph/util/GraphViz.java
@@ -76,7 +76,8 @@
/**
* Where is your dot program located? It will be called externally.
*/
- private static String DOT = "/usr/local/bin/dot"; // Linux
+ private static String DOT = "dot";
+// private static String DOT = "/usr/local/bin/dot"; // Linux
// private static String DOT = "c:/Program Files/Graphviz2.26.3/bin/dot.exe"; // Windows
/**
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/ParameteredTestCaseForSet.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/ParameteredTestCaseForSet.java
index beb41f3..816eabd 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/ParameteredTestCaseForSet.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/ParameteredTestCaseForSet.java
@@ -20,6 +20,8 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
@@ -47,7 +49,7 @@
@SuppressWarnings("deprecation")
@RunWith(value = Parameterized.class)
public class ParameteredTestCaseForSet {
- public static final DirType testSetType = DirType.SINGLEREAD;
+ public static final DirType testSetType = DirType.MULTIPLE_TANDEM_REPEAT;
public String dataPath;
public int KmerSize;
@@ -64,10 +66,12 @@
try {
dirSet = ts.getAllTestInputinDir();
for (String testDirPointer : dirSet) {
- String[] paraForSTest = testDirPointer.split("_");
- if(paraForSTest.length != 2)
- throw new Exception("the number of paramters is not enough");
- data.add(new Object[] { testDirPointer, paraForSTest[1].substring(1)});
+ Pattern kPattern = Pattern.compile("_k(\\d+)");
+ Matcher m = kPattern.matcher(testDirPointer);
+ boolean found = m.find();
+ if (!found)
+ throw new Exception("the number of parameters is not enough");
+ data.add(new Object[] { testDirPointer, m.group(1)});
}
} catch (IOException e) {
e.printStackTrace();
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/TestSet.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/TestSet.java
index bbbedfd..7c275c4 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/TestSet.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/graph/test/TestSet.java
@@ -16,6 +16,7 @@
import java.io.File;
import java.io.IOException;
+import java.util.ArrayList;
public class TestSet {
@@ -27,6 +28,7 @@
public final String BRIDGE = "bridge";
public final String RANDOM = "random";
public final String SINGLEREAD = "singleread";
+ public final String MULTIPLE_TANDEM_REPEAT = "multi_tandem_repeat";
public final String[] SRSET = { "HighSplitRepeat", "MidSplitRepeat", "LowSplitRepeat" };
public final String[] TIPSET = { "Tips1", "Tips2", "Tips3", "Tips4" };
@@ -38,7 +40,7 @@
SPLITREPEAT,
BRIDGE,
RANDOM,
- SINGLEREAD
+ SINGLEREAD, MULTIPLE_TANDEM_REPEAT
}
private DirType testSet;
@@ -63,8 +65,12 @@
break;
case RANDOM:
testSet = patternType.RANDOM;
+ break;
case SINGLEREAD:
testSet = patternType.SINGLEREAD;
+ break;
+ case MULTIPLE_TANDEM_REPEAT:
+ testSet = patternType.MULTIPLE_TANDEM_REPEAT;
}
}
@@ -108,19 +114,20 @@
return detectAllTestSet(PREFIX + RANDOM);
case SINGLEREAD:
return detectAllTestSet(PREFIX + SINGLEREAD);
+ case MULTIPLE_TANDEM_REPEAT:
+ return detectAllTestSet(PREFIX + MULTIPLE_TANDEM_REPEAT);
}
return null;
}
private String[] detectAllTestSet(String inputPrefix) throws IOException {
File src = new File(inputPrefix);
- String[] output = new String[src.listFiles().length - 1];
- int i = 0;
+ ArrayList<String> outArray = new ArrayList<String>(src.listFiles().length);
for (File f : src.listFiles()) {
if (!f.getName().contains(".DS_Store")){
- output[i++] = f.getPath().toString();
+ outArray.add(f.getPath().toString());
}
}
- return output;
+ return outArray.toArray(new String[outArray.size()]);
}
}
diff --git a/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat2_k3/repeat.txt b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat2_k3/repeat.txt
new file mode 100644
index 0000000..00b443b
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat2_k3/repeat.txt
@@ -0,0 +1 @@
+1 ATCGCGCAT
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat3_k3/repeat.txt b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat3_k3/repeat.txt
new file mode 100644
index 0000000..5a8db3a
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat3_k3/repeat.txt
@@ -0,0 +1 @@
+1 CGCGCGCATCGCGCCGCGCGCGGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCCG
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat_k3/repeat.txt b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat_k3/repeat.txt
new file mode 100644
index 0000000..2b9ee5b
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/input/sequence/multi_tandem_repeat/MultiTandemRepeat_k3/repeat.txt
@@ -0,0 +1 @@
+1 CGCGC
\ No newline at end of file