WIP: H3 implementation, using multiple tails

commit: d891139d4ecd99f694c45e9ec5d9b4238ea99f5b [log] [tgz]
author: Jake Biesinger <jake.biesinger@gmail.com> Mon May 20 13:53:32 2013 -0700
committer: Jake Biesinger <jake.biesinger@gmail.com> Wed May 22 10:38:49 2013 -0700
tree: 890e7d8e38ead0c07f22c699f09267f7ce264f49
parent: 7c14c8f5895f53adc1a831b212adc74a794b743a [diff]
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java
new file mode 100644
index 0000000..94f44b8
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java

@@ -0,0 +1,119 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.IOException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+import edu.uci.ics.genomix.hadoop.pmcommon.MergePathMultiSeqOutputFormat;
+import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
+import edu.uci.ics.genomix.hadoop.pmcommon.SNodeInitialMapper;
+import edu.uci.ics.genomix.hadoop.pmcommon.SNodeInitialReducer;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+@SuppressWarnings("deprecation")
+public class MergePathH3Driver {
+
+    private static class Options {
+        @Option(name = "-inputpath", usage = "the input path", required = true)
+        public String inputPath;
+
+        @Option(name = "-outputpath", usage = "the output path", required = true)
+        public String outputPath;
+
+        @Option(name = "-mergeresultpath", usage = "the merging results path", required = true)
+        public String mergeResultPath;
+
+        @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+        public int numReducers;
+
+        @Option(name = "-kmer-size", usage = "the size of kmer", required = true)
+        public int sizeKmer;
+
+        @Option(name = "-merge-rounds", usage = "the while rounds of merging", required = true)
+        public int mergeRound;
+
+    }
+
+    public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer,
+            int mergeRound, String defaultConfPath) throws IOException {
+        JobConf conf;
+        FileSystem dfs = FileSystem.get(conf);
+        String prevOutput = inputPath;
+        
+        for (int iMerge = 1; iMerge <= mergeRound; iMerge++) {
+            conf = makeJobConf(sizeKmer, iMerge, numReducers, defaultConfPath);
+
+            dfs.delete(new Path(outputPath), true);
+            JobClient.runJob(conf);
+            dfs.delete(new Path(inputPath + "stepNext"), true);
+            dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
+            dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
+            dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));
+        }
+    }
+    
+    private JobConf makeJobConf(int sizeKmer, int iMerge, int numReducers, String defaultConfPath) {
+        JobConf conf = new JobConf(MergePathH3Driver.class);
+        conf.setInt("sizeKmer", sizeKmer);
+        conf.setInt("iMerge", iMerge);
+        conf.setJobName("Path Merge-" + iMerge);
+        
+        if (defaultConfPath != null) {
+            conf.addResource(new Path(defaultConfPath));
+        }
+        
+        conf.setMapperClass(MergePathH3Mapper.class);
+        conf.setReducerClass(MergePathH3Reducer.class);
+        conf.setMapOutputKeyClass(VKmerBytesWritable.class);
+        conf.setMapOutputValueClass(MergePathValueWritable.class);
+        conf.setInputFormat(SequenceFileInputFormat.class);
+
+        MultipleOutputs.addNamedOutput(conf, "unmergedSinglePath" + iMerge, MergePathMultiSeqOutputFormat.class,
+                VKmerBytesWritable.class, MergePathValueWritable.class);
+
+        MultipleOutputs.addNamedOutput(conf, "mergedSinglePath" + iMerge, MergePathMultiSeqOutputFormat.class,
+                VKmerBytesWritable.class, MergePathValueWritable.class);
+
+        conf.setOutputKeyClass(VKmerBytesWritable.class);
+        conf.setOutputValueClass(MergePathValueWritable.class);
+
+        FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
+        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+        conf.setNumReduceTasks(numReducers);
+    }
+
+    public static void main(String[] args) throws Exception {
+        Options options = new Options();
+        CmdLineParser parser = new CmdLineParser(options);
+        parser.parseArgument(args);
+        MergePathH3Driver driver = new MergePathH3Driver();
+        driver.run(options.inputPath, options.outputPath, options.mergeResultPath, options.numReducers,
+                options.sizeKmer, options.mergeRound, null);
+    }
+}

diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java
new file mode 100644
index 0000000..a24feb6
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java

@@ -0,0 +1,91 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.IOException;
+import java.util.Random;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
+import edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3.State;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+import edu.uci.ics.genomix.pregelix.util.VertexUtil;
+
+@SuppressWarnings("deprecation")
+public class MergePathH3Mapper extends MapReduceBase implements
+        Mapper<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+
+    private int KMER_SIZE;
+    private VKmerBytesWritableFactory outputKmerFactory;
+    private MergePathValueWritable outputValue;
+    private VKmerBytesWritable tmpKmer;
+    private VKmerBytesWritable outputKmer;
+
+    private Random randGenerator;
+    private float probBeingRandomHead;
+
+    public void configure(JobConf job) {
+        KMER_SIZE = job.getInt("sizeKmer", 0);
+        outputKmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+        outputValue = new MergePathValueWritable();
+        tmpKmer = new VKmerBytesWritable(KMER_SIZE);
+        outputKmer = new VKmerBytesWritable(KMER_SIZE);
+
+        randGenerator = new Random(job.getLong("randSeed", 0));
+        probBeingRandomHead = job.getFloat("probBeingRandomHead", 0.5f);
+    }
+
+    private boolean isNodeRandomHead() {
+        return randGenerator.nextFloat() < probBeingRandomHead;
+    }
+
+    /*
+     * Retrieve the kmer corresponding to the single predecessor of kmerChain
+     * Make sure there is really only one neighbor in adjBitMap
+     */
+    private KmerBytesWritable getPredecessorKmer(KmerBytesWritable kmerChain, byte adjBitMap) {
+        byte preCode = (byte) ((adjBitMap & 0xF0) >> 4);
+        preCode = GeneCode.getGeneCodeFromBitMap(preCode);
+        return outputKmerFactory.shiftKmerWithPreCode(outputKmerFactory.getFirstKmerFromChain(KMER_SIZE, kmerChain),
+                preCode);
+    }
+
+    @Override
+    public void map(VKmerBytesWritable key, MergePathValueWritable value,
+            OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+        byte adjBitMap = value.getAdjBitMap();
+
+        // Map all path vertices; tail nodes are sent to their predecessors
+        if (VertexUtil.isPathVertex(adjBitMap)) {
+            if (isNodeRandomHead()) {
+                // head nodes map themselves
+                outputValue.set(adjBitMap, State.FROM_SELF, key);
+                output.collect(key, outputValue);
+            } else {
+                // tail nodes send themselves to their predecessor
+                outputKmer.set(getPredecessorKmer(key, adjBitMap));
+                outputValue.set(adjBitMap, State.FROM_SUCCESSOR, key);
+                output.collect(outputKmer, outputValue);
+            }
+        }
+    }
+}

diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java
new file mode 100644
index 0000000..a367e81
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java

@@ -0,0 +1,111 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+
+import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+@SuppressWarnings("deprecation")
+public class MergePathH3Reducer extends MapReduceBase implements
+        Reducer<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+
+    private int KMER_SIZE;
+    MultipleOutputs mos = null;
+    private int I_MERGE;
+
+    private VKmerBytesWritableFactory kmerFactory;
+    private MergePathValueWritable inputValue;
+    private VKmerBytesWritable outputKmer;
+    private MergePathValueWritable outputValue;
+
+    private VKmerBytesWritable headKmer;
+    private VKmerBytesWritable tailKmer;
+    private byte outputAdjMap;
+
+    public void configure(JobConf job) {
+        mos = new MultipleOutputs(job);
+        I_MERGE = Integer.parseInt(job.get("iMerge"));
+        KMER_SIZE = job.getInt("sizeKmer", 0);
+
+        inputValue = new MergePathValueWritable();
+        headKmer = new VKmerBytesWritable(KMER_SIZE);
+        tailKmer = new VKmerBytesWritable(KMER_SIZE);
+
+        outputValue = new MergePathValueWritable();
+        kmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+        outputKmer = new VKmerBytesWritable(KMER_SIZE);
+    }
+
+    public void reduce(VKmerBytesWritable key, Iterator<MergePathValueWritable> values,
+            OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+
+        inputValue = values.next();
+        if (!values.hasNext()) {
+            // all single nodes must be remapped
+            if (inputValue.getFlag() == State.FROM_SELF) {
+                // FROM_SELF => remap self
+                unmergedPathCollector(reporter).collect(key, inputValue);
+            } else {
+                // FROM_SUCCESSOR => remap successor
+                outputKmer.set(outputValue.getKmer());
+                outputValue.set(inputValue.getAdjBitMap(), State.EMPTY_MESSAGE, null);
+                unmergedPathCollector(reporter).collect(outputKmer, outputValue);
+            }
+        } else {
+            // multiple inputs => a merge will take place. Aggregate both, then collect the merged path
+            outputAdjMap = (byte) 0;
+            do {
+                if (inputValue.getFlag() == State.FROM_SELF) {
+                    // this is the head node.
+                    outputAdjMap |= inputValue.getAdjBitMap() & 0xF0; // keep head's predecessor  
+                    headKmer.set(inputValue.getKmer());
+                } else {
+                    // this is the tail node. 
+                    outputAdjMap |= inputValue.getAdjBitMap() & 0x0F; // keep tail's successor
+                    tailKmer.set(inputValue.getKmer());
+                }
+            } while (values.hasNext());
+            outputKmer.set(kmerFactory.mergeTwoKmer(headKmer, tailKmer));
+            outputValue.set(outputAdjMap, State.EMPTY_MESSAGE, null);
+            mergedPathCollector(reporter).collect(outputKmer, outputValue);
+        }
+    }
+
+    public void close() throws IOException {
+        mos.close();
+    }
+
+    @SuppressWarnings("unchecked")
+    public OutputCollector<VKmerBytesWritable, MergePathValueWritable> mergedPathCollector(Reporter reporter)
+            throws IOException {
+        return mos.getCollector("mergedSinglePath" + I_MERGE, reporter);
+    }
+
+    @SuppressWarnings("unchecked")
+    public OutputCollector<VKmerBytesWritable, MergePathValueWritable> unmergedPathCollector(Reporter reporter)
+            throws IOException {
+        return mos.getCollector("unmergedSinglePath" + I_MERGE, reporter);
+    }
+}

diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java
new file mode 100644
index 0000000..850530b
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java

@@ -0,0 +1,27 @@
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+public class State {
+
+    public static final byte EMPTY_MESSAGE = 0;
+    public static final byte FROM_SELF = 1;
+    public static final byte FROM_SUCCESSOR = 2;
+
+    public final static class STATE_CONTENT {
+
+        public static String getContentFromCode(byte code) {
+            String r = "ERROR_BAD_MESSAGE";
+            switch (code) {
+                case EMPTY_MESSAGE:
+                    r = "EMPTY_MESSAGE";
+                    break;
+                case FROM_SELF:
+                    r = "FROM_SELF";
+                    break;
+                case FROM_SUCCESSOR:
+                    r = "FROM_SUCCESSOR";
+                    break;
+            }
+            return r;
+        }
+    }
+}
commit	d891139d4ecd99f694c45e9ec5d9b4238ea99f5b	[log] [tgz]
author	Jake Biesinger <jake.biesinger@gmail.com>	Mon May 20 13:53:32 2013 -0700
committer	Jake Biesinger <jake.biesinger@gmail.com>	Wed May 22 10:38:49 2013 -0700
tree	890e7d8e38ead0c07f22c699f09267f7ce264f49
parent	7c14c8f5895f53adc1a831b212adc74a794b743a [diff]