WIP: H3 implementation, using multiple tails
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java
new file mode 100644
index 0000000..94f44b8
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.IOException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+import edu.uci.ics.genomix.hadoop.pmcommon.MergePathMultiSeqOutputFormat;
+import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
+import edu.uci.ics.genomix.hadoop.pmcommon.SNodeInitialMapper;
+import edu.uci.ics.genomix.hadoop.pmcommon.SNodeInitialReducer;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+@SuppressWarnings("deprecation")
+public class MergePathH3Driver {
+
+ private static class Options {
+ @Option(name = "-inputpath", usage = "the input path", required = true)
+ public String inputPath;
+
+ @Option(name = "-outputpath", usage = "the output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-mergeresultpath", usage = "the merging results path", required = true)
+ public String mergeResultPath;
+
+ @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+ public int numReducers;
+
+ @Option(name = "-kmer-size", usage = "the size of kmer", required = true)
+ public int sizeKmer;
+
+ @Option(name = "-merge-rounds", usage = "the while rounds of merging", required = true)
+ public int mergeRound;
+
+ }
+
+ public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer,
+ int mergeRound, String defaultConfPath) throws IOException {
+ JobConf conf;
+ FileSystem dfs = FileSystem.get(conf);
+ String prevOutput = inputPath;
+
+ for (int iMerge = 1; iMerge <= mergeRound; iMerge++) {
+ conf = makeJobConf(sizeKmer, iMerge, numReducers, defaultConfPath);
+
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ dfs.delete(new Path(inputPath + "stepNext"), true);
+ dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
+ dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
+ dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));
+ }
+ }
+
+ private JobConf makeJobConf(int sizeKmer, int iMerge, int numReducers, String defaultConfPath) {
+ JobConf conf = new JobConf(MergePathH3Driver.class);
+ conf.setInt("sizeKmer", sizeKmer);
+ conf.setInt("iMerge", iMerge);
+ conf.setJobName("Path Merge-" + iMerge);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+
+ conf.setMapperClass(MergePathH3Mapper.class);
+ conf.setReducerClass(MergePathH3Reducer.class);
+ conf.setMapOutputKeyClass(VKmerBytesWritable.class);
+ conf.setMapOutputValueClass(MergePathValueWritable.class);
+ conf.setInputFormat(SequenceFileInputFormat.class);
+
+ MultipleOutputs.addNamedOutput(conf, "unmergedSinglePath" + iMerge, MergePathMultiSeqOutputFormat.class,
+ VKmerBytesWritable.class, MergePathValueWritable.class);
+
+ MultipleOutputs.addNamedOutput(conf, "mergedSinglePath" + iMerge, MergePathMultiSeqOutputFormat.class,
+ VKmerBytesWritable.class, MergePathValueWritable.class);
+
+ conf.setOutputKeyClass(VKmerBytesWritable.class);
+ conf.setOutputValueClass(MergePathValueWritable.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+ MergePathH3Driver driver = new MergePathH3Driver();
+ driver.run(options.inputPath, options.outputPath, options.mergeResultPath, options.numReducers,
+ options.sizeKmer, options.mergeRound, null);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java
new file mode 100644
index 0000000..a24feb6
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.IOException;
+import java.util.Random;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
+import edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3.State;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+import edu.uci.ics.genomix.pregelix.util.VertexUtil;
+
+@SuppressWarnings("deprecation")
+public class MergePathH3Mapper extends MapReduceBase implements
+ Mapper<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+
+ private int KMER_SIZE;
+ private VKmerBytesWritableFactory outputKmerFactory;
+ private MergePathValueWritable outputValue;
+ private VKmerBytesWritable tmpKmer;
+ private VKmerBytesWritable outputKmer;
+
+ private Random randGenerator;
+ private float probBeingRandomHead;
+
+ public void configure(JobConf job) {
+ KMER_SIZE = job.getInt("sizeKmer", 0);
+ outputKmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+ outputValue = new MergePathValueWritable();
+ tmpKmer = new VKmerBytesWritable(KMER_SIZE);
+ outputKmer = new VKmerBytesWritable(KMER_SIZE);
+
+ randGenerator = new Random(job.getLong("randSeed", 0));
+ probBeingRandomHead = job.getFloat("probBeingRandomHead", 0.5f);
+ }
+
+ private boolean isNodeRandomHead() {
+ return randGenerator.nextFloat() < probBeingRandomHead;
+ }
+
+ /*
+ * Retrieve the kmer corresponding to the single predecessor of kmerChain
+ * Make sure there is really only one neighbor in adjBitMap
+ */
+ private KmerBytesWritable getPredecessorKmer(KmerBytesWritable kmerChain, byte adjBitMap) {
+ byte preCode = (byte) ((adjBitMap & 0xF0) >> 4);
+ preCode = GeneCode.getGeneCodeFromBitMap(preCode);
+ return outputKmerFactory.shiftKmerWithPreCode(outputKmerFactory.getFirstKmerFromChain(KMER_SIZE, kmerChain),
+ preCode);
+ }
+
+ @Override
+ public void map(VKmerBytesWritable key, MergePathValueWritable value,
+ OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ byte adjBitMap = value.getAdjBitMap();
+
+ // Map all path vertices; tail nodes are sent to their predecessors
+ if (VertexUtil.isPathVertex(adjBitMap)) {
+ if (isNodeRandomHead()) {
+ // head nodes map themselves
+ outputValue.set(adjBitMap, State.FROM_SELF, key);
+ output.collect(key, outputValue);
+ } else {
+ // tail nodes send themselves to their predecessor
+ outputKmer.set(getPredecessorKmer(key, adjBitMap));
+ outputValue.set(adjBitMap, State.FROM_SUCCESSOR, key);
+ output.collect(outputKmer, outputValue);
+ }
+ }
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java
new file mode 100644
index 0000000..a367e81
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+
+import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+@SuppressWarnings("deprecation")
+public class MergePathH3Reducer extends MapReduceBase implements
+ Reducer<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+
+ private int KMER_SIZE;
+ MultipleOutputs mos = null;
+ private int I_MERGE;
+
+ private VKmerBytesWritableFactory kmerFactory;
+ private MergePathValueWritable inputValue;
+ private VKmerBytesWritable outputKmer;
+ private MergePathValueWritable outputValue;
+
+ private VKmerBytesWritable headKmer;
+ private VKmerBytesWritable tailKmer;
+ private byte outputAdjMap;
+
+ public void configure(JobConf job) {
+ mos = new MultipleOutputs(job);
+ I_MERGE = Integer.parseInt(job.get("iMerge"));
+ KMER_SIZE = job.getInt("sizeKmer", 0);
+
+ inputValue = new MergePathValueWritable();
+ headKmer = new VKmerBytesWritable(KMER_SIZE);
+ tailKmer = new VKmerBytesWritable(KMER_SIZE);
+
+ outputValue = new MergePathValueWritable();
+ kmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+ outputKmer = new VKmerBytesWritable(KMER_SIZE);
+ }
+
+ public void reduce(VKmerBytesWritable key, Iterator<MergePathValueWritable> values,
+ OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+
+ inputValue = values.next();
+ if (!values.hasNext()) {
+ // all single nodes must be remapped
+ if (inputValue.getFlag() == State.FROM_SELF) {
+ // FROM_SELF => remap self
+ unmergedPathCollector(reporter).collect(key, inputValue);
+ } else {
+ // FROM_SUCCESSOR => remap successor
+ outputKmer.set(outputValue.getKmer());
+ outputValue.set(inputValue.getAdjBitMap(), State.EMPTY_MESSAGE, null);
+ unmergedPathCollector(reporter).collect(outputKmer, outputValue);
+ }
+ } else {
+ // multiple inputs => a merge will take place. Aggregate both, then collect the merged path
+ outputAdjMap = (byte) 0;
+ do {
+ if (inputValue.getFlag() == State.FROM_SELF) {
+ // this is the head node.
+ outputAdjMap |= inputValue.getAdjBitMap() & 0xF0; // keep head's predecessor
+ headKmer.set(inputValue.getKmer());
+ } else {
+ // this is the tail node.
+ outputAdjMap |= inputValue.getAdjBitMap() & 0x0F; // keep tail's successor
+ tailKmer.set(inputValue.getKmer());
+ }
+ } while (values.hasNext());
+ outputKmer.set(kmerFactory.mergeTwoKmer(headKmer, tailKmer));
+ outputValue.set(outputAdjMap, State.EMPTY_MESSAGE, null);
+ mergedPathCollector(reporter).collect(outputKmer, outputValue);
+ }
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ }
+
+ @SuppressWarnings("unchecked")
+ public OutputCollector<VKmerBytesWritable, MergePathValueWritable> mergedPathCollector(Reporter reporter)
+ throws IOException {
+ return mos.getCollector("mergedSinglePath" + I_MERGE, reporter);
+ }
+
+ @SuppressWarnings("unchecked")
+ public OutputCollector<VKmerBytesWritable, MergePathValueWritable> unmergedPathCollector(Reporter reporter)
+ throws IOException {
+ return mos.getCollector("unmergedSinglePath" + I_MERGE, reporter);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java
new file mode 100644
index 0000000..850530b
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java
@@ -0,0 +1,27 @@
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+public class State {
+
+ public static final byte EMPTY_MESSAGE = 0;
+ public static final byte FROM_SELF = 1;
+ public static final byte FROM_SUCCESSOR = 2;
+
+ public final static class STATE_CONTENT {
+
+ public static String getContentFromCode(byte code) {
+ String r = "ERROR_BAD_MESSAGE";
+ switch (code) {
+ case EMPTY_MESSAGE:
+ r = "EMPTY_MESSAGE";
+ break;
+ case FROM_SELF:
+ r = "FROM_SELF";
+ break;
+ case FROM_SUCCESSOR:
+ r = "FROM_SUCCESSOR";
+ break;
+ }
+ return r;
+ }
+ }
+}