don't save intermediate pipeline results to HDFS by default; add a -saveIntermediateResults option to enable it
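
With -saveIntermediateResults the driver submits each Pregelix job on its own
(pregelixDriver.runJob per step), so each step's output is saved under the HDFS
work path and the pipeline can be resumed from an arbitrary point; without it,
the whole job list is handed to pregelixDriver.runJobs in a single call and no
intermediate output is kept.

A minimal usage sketch, mirroring the arguments used in GenomixDriver.main()
(the local input path is only an example, not part of this patch):

    // run locally, keeping every step's output on HDFS
    String[] args = { "-runLocal", "-kmerLength", "3",
            "-localInput", "../genomix-pregelix/data/input/reads/pathmerge",
            "-saveIntermediateResults", "true" };
    GenomixJobConf conf = GenomixJobConf.fromArguments(args);
    new GenomixDriver().runGenomix(conf);
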
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/config/GenomixJobConf.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/config/GenomixJobConf.java
index bba6d7e..346041b 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/config/GenomixJobConf.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/config/GenomixJobConf.java
@@ -64,9 +64,12 @@
@Option(name = "-hdfsOutput", usage = "HDFS directory where the final step's output will be saved", required = false)
private String hdfsOutput;
- @Option(name = "-hdfsWorkPath", usage = "HDFS directory where pipeline temp ouptut will be saved", required = false)
+ @Option(name = "-hdfsWorkPath", usage = "HDFS directory where pipeline temp output will be saved", required = false)
private String hdfsWorkPath;
+ @Option(name = "-saveIntermediateResults", usage = "whether or not to save intermediate steps to HDFS (default: false)", required = false)
+ private boolean saveIntermediateResults = false;
+
// Graph cleaning
@Option(name = "-bridgeRemove_maxLength", usage = "Nodes with length <= bridgeRemoveLength that bridge separate paths are removed from the graph", required = false)
@@ -156,6 +159,7 @@
public static final String FINAL_OUTPUT_DIR = "genomix.final.output.dir";
public static final String LOCAL_INPUT_DIR = "genomix.initial.local.input.dir";
public static final String LOCAL_OUTPUT_DIR = "genomix.final.local.output.dir";
+ public static final String SAVE_INTERMEDIATE_RESULTS = "genomix.save.intermediate.results";
// Graph cleaning
public static final String BRIDGE_REMOVE_MAX_LENGTH = "genomix.bridgeRemove.maxLength";
@@ -206,6 +210,14 @@
public static final String OUTPUT_FORMAT_BINARY = "genomix.outputformat.binary";
public static final String OUTPUT_FORMAT_TEXT = "genomix.outputformat.text";
public static final String HDFS_WORK_PATH = "genomix.hdfs.work.path";
+ private static final Patterns[] DEFAULT_PIPELINE_ORDER = {
+ Patterns.BUILD, Patterns.MERGE,
+ Patterns.TIP_REMOVE, Patterns.MERGE,
+ Patterns.BUBBLE, Patterns.MERGE,
+ Patterns.LOW_COVERAGE, Patterns.MERGE,
+ Patterns.SPLIT_REPEAT, Patterns.MERGE,
+ Patterns.SCAFFOLD, Patterns.MERGE
+ };
private String[] extraArguments = {};
@@ -310,15 +322,7 @@
setInt(TIP_REMOVE_MAX_LENGTH, kmerLength + 1);
if (get(PIPELINE_ORDER) == null) {
- Patterns[] steps = {
- Patterns.BUILD, Patterns.MERGE,
- Patterns.TIP_REMOVE, Patterns.MERGE,
- Patterns.BUBBLE, Patterns.MERGE,
- Patterns.LOW_COVERAGE, Patterns.MERGE,
- Patterns.SPLIT_REPEAT, Patterns.MERGE,
- Patterns.SCAFFOLD, Patterns.MERGE
- };
- set(PIPELINE_ORDER, Patterns.stringFromArray(steps));
+ set(PIPELINE_ORDER, Patterns.stringFromArray(DEFAULT_PIPELINE_ORDER));
}
// hdfs setup
if (get(HDFS_WORK_PATH) == null)
@@ -355,6 +359,8 @@
set(LOCAL_OUTPUT_DIR, opts.localOutput);
if (opts.hdfsWorkPath != null)
set(HDFS_WORK_PATH, opts.hdfsWorkPath);
+ setBoolean(SAVE_INTERMEDIATE_RESULTS, opts.saveIntermediateResults);
+
if (opts.runLocal && (opts.ipAddress != null || opts.port != -1))
throw new IllegalArgumentException("Option -runLocal cannot be set at the same time as -port or -ip! (-runLocal starts a cluster; -ip and -port specify an existing cluster)");
diff --git a/genomix/genomix-driver/src/main/java/edu/uci/ics/genomix/driver/GenomixDriver.java b/genomix/genomix-driver/src/main/java/edu/uci/ics/genomix/driver/GenomixDriver.java
index 4ce1c23..aeae637 100644
--- a/genomix/genomix-driver/src/main/java/edu/uci/ics/genomix/driver/GenomixDriver.java
+++ b/genomix/genomix-driver/src/main/java/edu/uci/ics/genomix/driver/GenomixDriver.java
@@ -236,15 +236,18 @@
break;
}
}
- for (int i = 0; i < jobs.size(); i++) {
- // pregelixDriver = new edu.uci.ics.pregelix.core.driver.Driver(jobs.get(i).getConfiguration().getClass(PregelixJob.VERTEX_CLASS, null));
- // pregelixDriver = new edu.uci.ics.pregelix.core.driver.Driver(jobs.get(i).getConfiguration().getClass(PregelixJob.VERTEX_CLASS, null));
- pregelixDriver.runJob(jobs.get(i), conf.get(GenomixJobConf.IP_ADDRESS),
- Integer.parseInt(conf.get(GenomixJobConf.PORT)));
+
+ // if the user wants to, we can save the intermediate results to HDFS (running each job individually)
+ // this would let them resume at arbitrary points of the pipeline
+ if (Boolean.parseBoolean(conf.get(GenomixJobConf.SAVE_INTERMEDIATE_RESULTS))) {
+ for (int i = 0; i < jobs.size(); i++) {
+ pregelixDriver.runJob(jobs.get(i), conf.get(GenomixJobConf.IP_ADDRESS),
+ Integer.parseInt(conf.get(GenomixJobConf.PORT)));
+ }
+ } else {
+ pregelixDriver.runJobs(jobs, conf.get(GenomixJobConf.IP_ADDRESS), Integer.parseInt(conf.get(GenomixJobConf.PORT)));
}
- // pregelixDriver.runJobs(jobs, conf.get(GenomixJobConf.IP_ADDRESS), Integer.parseInt(conf.get(GenomixJobConf.PORT)));
-
if (conf.get(GenomixJobConf.LOCAL_OUTPUT_DIR) != null)
copyBinToLocal(conf, curOutput, conf.get(GenomixJobConf.LOCAL_OUTPUT_DIR));
if (conf.get(GenomixJobConf.FINAL_OUTPUT_DIR) != null)
@@ -256,6 +259,7 @@
public static void main(String[] args) throws CmdLineException, NumberFormatException, HyracksException, Exception {
String[] myArgs = { "-runLocal", "-kmerLength", "3",
+ "-saveIntermediateResults", "true",
// "-localInput", "../genomix-pregelix/data/input/reads/synthetic/",
"-localInput", "../genomix-pregelix/data/input/reads/pathmerge",
// "-localInput", "/home/wbiesing/code/hyracks/genomix/genomix-pregelix/data/input/reads/test",
@@ -264,7 +268,11 @@
// "-pipelineOrder", "BUILD,MERGE",
// "-inputDir", "/home/wbiesing/code/hyracks/genomix/genomix-driver/graphbuild.binmerge",
// "-localInput", "../genomix-pregelix/data/TestSet/PathMerge/CyclePath/bin/part-00000",
- "-pipelineOrder", "BUILD_HADOOP,MERGE,TIP_REMOVE" };
+ "-pipelineOrder", "BUILD_HADOOP,MERGE,TIP_REMOVE,MERGE,BUBBLE,MERGE" };
+
+// Patterns.BUILD, Patterns.MERGE,
+// Patterns.TIP_REMOVE, Patterns.MERGE,
+// Patterns.BUBBLE, Patterns.MERGE,
GenomixJobConf conf = GenomixJobConf.fromArguments(myArgs);
GenomixDriver driver = new GenomixDriver();
driver.runGenomix(conf);