H3 implemented, untested
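
A minimal smoke-test sketch of the intended entry point (untested). The paths,
k-mer size, reducer count, and round count below are placeholders, and the
input is assumed to be the SequenceFile written by the graph-build step (see
the "verify input format" TODO in MergePathsH3.run):

    import edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3.MergePathsH3Driver;

    public class MergePathsH3Smoke {
        public static void main(String[] args) throws Exception {
            // args to run(): inputPath, outputPath, numReducers, kmerSize, mergeRounds
            new MergePathsH3Driver().run("graphbuild.out", "mergepathsH3.final", 1, 5, 3);
        }
    }
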
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java
deleted file mode 100644
index 94f44b8..0000000
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Driver.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright 2009-2012 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
-
-import java.io.IOException;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.apache.hadoop.mapred.TextOutputFormat;
-import org.apache.hadoop.mapred.lib.MultipleOutputs;
-import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
-import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
-import org.kohsuke.args4j.CmdLineParser;
-import org.kohsuke.args4j.Option;
-
-import edu.uci.ics.genomix.hadoop.pmcommon.MergePathMultiSeqOutputFormat;
-import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
-import edu.uci.ics.genomix.hadoop.pmcommon.SNodeInitialMapper;
-import edu.uci.ics.genomix.hadoop.pmcommon.SNodeInitialReducer;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
-import edu.uci.ics.genomix.type.VKmerBytesWritable;
-@SuppressWarnings("deprecation")
-public class MergePathH3Driver {
-
- private static class Options {
- @Option(name = "-inputpath", usage = "the input path", required = true)
- public String inputPath;
-
- @Option(name = "-outputpath", usage = "the output path", required = true)
- public String outputPath;
-
- @Option(name = "-mergeresultpath", usage = "the merging results path", required = true)
- public String mergeResultPath;
-
- @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
- public int numReducers;
-
- @Option(name = "-kmer-size", usage = "the size of kmer", required = true)
- public int sizeKmer;
-
- @Option(name = "-merge-rounds", usage = "the while rounds of merging", required = true)
- public int mergeRound;
-
- }
-
- public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer,
- int mergeRound, String defaultConfPath) throws IOException {
- JobConf conf;
- FileSystem dfs = FileSystem.get(conf);
- String prevOutput = inputPath;
-
- for (int iMerge = 1; iMerge <= mergeRound; iMerge++) {
- conf = makeJobConf(sizeKmer, iMerge, numReducers, defaultConfPath);
-
- dfs.delete(new Path(outputPath), true);
- JobClient.runJob(conf);
- dfs.delete(new Path(inputPath + "stepNext"), true);
- dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
- dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
- dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));
- }
- }
-
- private JobConf makeJobConf(int sizeKmer, int iMerge, int numReducers, String defaultConfPath) {
- JobConf conf = new JobConf(MergePathH3Driver.class);
- conf.setInt("sizeKmer", sizeKmer);
- conf.setInt("iMerge", iMerge);
- conf.setJobName("Path Merge-" + iMerge);
-
- if (defaultConfPath != null) {
- conf.addResource(new Path(defaultConfPath));
- }
-
- conf.setMapperClass(MergePathH3Mapper.class);
- conf.setReducerClass(MergePathH3Reducer.class);
- conf.setMapOutputKeyClass(VKmerBytesWritable.class);
- conf.setMapOutputValueClass(MergePathValueWritable.class);
- conf.setInputFormat(SequenceFileInputFormat.class);
-
- MultipleOutputs.addNamedOutput(conf, "unmergedSinglePath" + iMerge, MergePathMultiSeqOutputFormat.class,
- VKmerBytesWritable.class, MergePathValueWritable.class);
-
- MultipleOutputs.addNamedOutput(conf, "mergedSinglePath" + iMerge, MergePathMultiSeqOutputFormat.class,
- VKmerBytesWritable.class, MergePathValueWritable.class);
-
- conf.setOutputKeyClass(VKmerBytesWritable.class);
- conf.setOutputValueClass(MergePathValueWritable.class);
-
- FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
- FileOutputFormat.setOutputPath(conf, new Path(outputPath));
- conf.setNumReduceTasks(numReducers);
- }
-
- public static void main(String[] args) throws Exception {
- Options options = new Options();
- CmdLineParser parser = new CmdLineParser(options);
- parser.parseArgument(args);
- MergePathH3Driver driver = new MergePathH3Driver();
- driver.run(options.inputPath, options.outputPath, options.mergeResultPath, options.numReducers,
- options.sizeKmer, options.mergeRound, null);
- }
-}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java
deleted file mode 100644
index a24feb6..0000000
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Mapper.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright 2009-2012 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
-
-import java.io.IOException;
-import java.util.Random;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.MapReduceBase;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reporter;
-
-import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
-import edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3.State;
-import edu.uci.ics.genomix.type.GeneCode;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
-import edu.uci.ics.genomix.type.VKmerBytesWritable;
-import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
-import edu.uci.ics.genomix.pregelix.util.VertexUtil;
-
-@SuppressWarnings("deprecation")
-public class MergePathH3Mapper extends MapReduceBase implements
- Mapper<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
-
- private int KMER_SIZE;
- private VKmerBytesWritableFactory outputKmerFactory;
- private MergePathValueWritable outputValue;
- private VKmerBytesWritable tmpKmer;
- private VKmerBytesWritable outputKmer;
-
- private Random randGenerator;
- private float probBeingRandomHead;
-
- public void configure(JobConf job) {
- KMER_SIZE = job.getInt("sizeKmer", 0);
- outputKmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
- outputValue = new MergePathValueWritable();
- tmpKmer = new VKmerBytesWritable(KMER_SIZE);
- outputKmer = new VKmerBytesWritable(KMER_SIZE);
-
- randGenerator = new Random(job.getLong("randSeed", 0));
- probBeingRandomHead = job.getFloat("probBeingRandomHead", 0.5f);
- }
-
- private boolean isNodeRandomHead() {
- return randGenerator.nextFloat() < probBeingRandomHead;
- }
-
- /*
- * Retrieve the kmer corresponding to the single predecessor of kmerChain
- * Make sure there is really only one neighbor in adjBitMap
- */
- private KmerBytesWritable getPredecessorKmer(KmerBytesWritable kmerChain, byte adjBitMap) {
- byte preCode = (byte) ((adjBitMap & 0xF0) >> 4);
- preCode = GeneCode.getGeneCodeFromBitMap(preCode);
- return outputKmerFactory.shiftKmerWithPreCode(outputKmerFactory.getFirstKmerFromChain(KMER_SIZE, kmerChain),
- preCode);
- }
-
- @Override
- public void map(VKmerBytesWritable key, MergePathValueWritable value,
- OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
- byte adjBitMap = value.getAdjBitMap();
-
- // Map all path vertices; tail nodes are sent to their predecessors
- if (VertexUtil.isPathVertex(adjBitMap)) {
- if (isNodeRandomHead()) {
- // head nodes map themselves
- outputValue.set(adjBitMap, State.FROM_SELF, key);
- output.collect(key, outputValue);
- } else {
- // tail nodes send themselves to their predecessor
- outputKmer.set(getPredecessorKmer(key, adjBitMap));
- outputValue.set(adjBitMap, State.FROM_SUCCESSOR, key);
- output.collect(outputKmer, outputValue);
- }
- }
- }
-}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java
deleted file mode 100644
index a367e81..0000000
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathH3Reducer.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright 2009-2012 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
-
-import java.io.IOException;
-import java.util.Iterator;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.MapReduceBase;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.lib.MultipleOutputs;
-
-import edu.uci.ics.genomix.hadoop.pmcommon.MergePathValueWritable;
-import edu.uci.ics.genomix.type.VKmerBytesWritable;
-import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
-
-@SuppressWarnings("deprecation")
-public class MergePathH3Reducer extends MapReduceBase implements
- Reducer<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
-
- private int KMER_SIZE;
- MultipleOutputs mos = null;
- private int I_MERGE;
-
- private VKmerBytesWritableFactory kmerFactory;
- private MergePathValueWritable inputValue;
- private VKmerBytesWritable outputKmer;
- private MergePathValueWritable outputValue;
-
- private VKmerBytesWritable headKmer;
- private VKmerBytesWritable tailKmer;
- private byte outputAdjMap;
-
- public void configure(JobConf job) {
- mos = new MultipleOutputs(job);
- I_MERGE = Integer.parseInt(job.get("iMerge"));
- KMER_SIZE = job.getInt("sizeKmer", 0);
-
- inputValue = new MergePathValueWritable();
- headKmer = new VKmerBytesWritable(KMER_SIZE);
- tailKmer = new VKmerBytesWritable(KMER_SIZE);
-
- outputValue = new MergePathValueWritable();
- kmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
- outputKmer = new VKmerBytesWritable(KMER_SIZE);
- }
-
- public void reduce(VKmerBytesWritable key, Iterator<MergePathValueWritable> values,
- OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
-
- inputValue = values.next();
- if (!values.hasNext()) {
- // all single nodes must be remapped
- if (inputValue.getFlag() == State.FROM_SELF) {
- // FROM_SELF => remap self
- unmergedPathCollector(reporter).collect(key, inputValue);
- } else {
- // FROM_SUCCESSOR => remap successor
- outputKmer.set(outputValue.getKmer());
- outputValue.set(inputValue.getAdjBitMap(), State.EMPTY_MESSAGE, null);
- unmergedPathCollector(reporter).collect(outputKmer, outputValue);
- }
- } else {
- // multiple inputs => a merge will take place. Aggregate both, then collect the merged path
- outputAdjMap = (byte) 0;
- do {
- if (inputValue.getFlag() == State.FROM_SELF) {
- // this is the head node.
- outputAdjMap |= inputValue.getAdjBitMap() & 0xF0; // keep head's predecessor
- headKmer.set(inputValue.getKmer());
- } else {
- // this is the tail node.
- outputAdjMap |= inputValue.getAdjBitMap() & 0x0F; // keep tail's successor
- tailKmer.set(inputValue.getKmer());
- }
- } while (values.hasNext());
- outputKmer.set(kmerFactory.mergeTwoKmer(headKmer, tailKmer));
- outputValue.set(outputAdjMap, State.EMPTY_MESSAGE, null);
- mergedPathCollector(reporter).collect(outputKmer, outputValue);
- }
- }
-
- public void close() throws IOException {
- mos.close();
- }
-
- @SuppressWarnings("unchecked")
- public OutputCollector<VKmerBytesWritable, MergePathValueWritable> mergedPathCollector(Reporter reporter)
- throws IOException {
- return mos.getCollector("mergedSinglePath" + I_MERGE, reporter);
- }
-
- @SuppressWarnings("unchecked")
- public OutputCollector<VKmerBytesWritable, MergePathValueWritable> unmergedPathCollector(Reporter reporter)
- throws IOException {
- return mos.getCollector("unmergedSinglePath" + I_MERGE, reporter);
- }
-}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3.java
new file mode 100644
index 0000000..66dac41
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3.java
@@ -0,0 +1,202 @@
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import edu.uci.ics.genomix.type.NodeWritable;
+import edu.uci.ics.genomix.type.PositionWritable;
+
+@SuppressWarnings("deprecation")
+public class MergePathsH3 extends Configured implements Tool {
+ /*
+ * Flags used when sending messages
+ */
+ public static class MessageFlag {
+ public static final byte EMPTY_MESSAGE = 0;
+ public static final byte FROM_SELF = 1;
+ public static final byte FROM_SUCCESSOR = 1 << 1;
+ public static final byte IS_HEAD = 1 << 2;
+ public static final byte FROM_PREDECESSOR = 1 << 3;
+
+ public static String getFlagAsString(byte code) {
+ // TODO: allow multiple flags to be set
+ switch (code) {
+ case EMPTY_MESSAGE:
+ return "EMPTY_MESSAGE";
+ case FROM_SELF:
+ return "FROM_SELF";
+ case FROM_SUCCESSOR:
+ return "FROM_SUCCESSOR";
+ }
+ return "ERROR_BAD_MESSAGE";
+ }
+ }
+
+ /*
+     * Mapper class: partition the graph. Pseudoheads send themselves to their
+     * successors; all other path nodes map themselves.
+ */
+ private static class MergePathsH3Mapper extends MapReduceBase implements
+ Mapper<PositionWritable, MessageWritableH3, PositionWritable, MessageWritableH3> {
+
+ private static long randSeed;
+ private Random randGenerator;
+ private float probBeingRandomHead;
+
+ private int KMER_SIZE;
+ private PositionWritable outputKey;
+ private MessageWritableH3 outputValue;
+ private NodeWritable curNode;
+
+ public void configure(JobConf conf) {
+ randSeed = conf.getLong("randomSeed", 0);
+ randGenerator = new Random(randSeed);
+ probBeingRandomHead = conf.getFloat("probBeingRandomHead", 0.5f);
+
+ KMER_SIZE = conf.getInt("sizeKmer", 0);
+ outputValue = new MessageWritableH3(KMER_SIZE);
+ outputKey = new PositionWritable();
+ curNode = new NodeWritable(KMER_SIZE);
+ }
+
+ private boolean isNodeRandomHead(PositionWritable nodeID) {
+ // "deterministically random", based on node id
+ randGenerator.setSeed(randSeed ^ nodeID.hashCode());
+ return randGenerator.nextFloat() < probBeingRandomHead;
+ }
+
+ @Override
+ public void map(PositionWritable key, MessageWritableH3 value,
+ OutputCollector<PositionWritable, MessageWritableH3> output, Reporter reporter) throws IOException {
+ curNode = value.getNode();
+            // Map all path vertices; pseudoheads are sent to their successors, all others map themselves
+ if (curNode.isPathNode()) {
+ boolean isHead = (value.getFlag() & MessageFlag.IS_HEAD) == MessageFlag.IS_HEAD;
+ if (isHead || isNodeRandomHead(curNode.getNodeID())) {
+ // head nodes send themselves to their successor
+ outputKey = curNode.getOutgoingList().getPosition(0); // TODO: does this need to be a .set call?
+ outputValue.set((byte) (MessageFlag.FROM_PREDECESSOR | MessageFlag.IS_HEAD), curNode);
+ output.collect(outputKey, outputValue);
+ } else {
+ // tail nodes map themselves
+ outputValue.set(MessageFlag.FROM_SELF, curNode);
+ output.collect(key, outputValue);
+ }
+ }
+ }
+ }
+
+ /*
+ * Reducer class: merge nodes that co-occur; for singletons, remap the original nodes
+ */
+ private static class MergePathsH3Reducer extends MapReduceBase implements
+ Reducer<PositionWritable, MessageWritableH3, PositionWritable, MessageWritableH3> {
+
+ private int KMER_SIZE;
+ private MessageWritableH3 inputValue;
+ private MessageWritableH3 outputValue;
+ private NodeWritable headNode;
+ private NodeWritable tailNode;
+ private int count;
+
+ public void configure(JobConf conf) {
+ KMER_SIZE = conf.getInt("sizeKmer", 0);
+ outputValue = new MessageWritableH3(KMER_SIZE);
+ headNode = new NodeWritable(KMER_SIZE);
+ tailNode = new NodeWritable(KMER_SIZE);
+ }
+
+ @Override
+ public void reduce(PositionWritable key, Iterator<MessageWritableH3> values,
+ OutputCollector<PositionWritable, MessageWritableH3> output, Reporter reporter) throws IOException {
+
+ inputValue = values.next();
+ if (!values.hasNext()) {
+ // all single nodes must be remapped
+ if ((inputValue.getFlag() & MessageFlag.FROM_SELF) == MessageFlag.FROM_SELF) {
+ // FROM_SELF => remap self
+ output.collect(key, inputValue);
+ } else {
+ // FROM_PREDECESSOR => remap predecessor
+ output.collect(inputValue.getNode().getNodeID(), inputValue);
+ }
+ } else {
+ // multiple inputs => a merge will take place. Aggregate both, then collect the merged path
+ count = 0;
+            while (true) {
+                if ((inputValue.getFlag() & MessageFlag.FROM_PREDECESSOR) == MessageFlag.FROM_PREDECESSOR) {
+                    headNode.set(inputValue.getNode());
+                } else {
+                    tailNode.set(inputValue.getNode());
+                }
+                count++;
+                if (!values.hasNext()) {
+                    break;
+                }
+                inputValue = values.next();
+            }
+ if (count != 2) {
+ throw new IOException("Expected two nodes in MergePathsH3 reduce; saw " + String.valueOf(count));
+ }
+ // merge the head and tail as saved output
+ tailNode.mergePreviousWithinOneRead(headNode);
+ outputValue.set(inputValue.getFlag(), tailNode);
+ output.collect(key, outputValue);
+ }
+ }
+ }
+
+ /*
+ * Run one iteration of the mergePaths algorithm
+ */
+ public RunningJob run(String inputPath, String outputPath, JobConf baseConf) throws IOException {
+ JobConf conf = new JobConf(baseConf);
+ conf.setJarByClass(MergePathsH3.class);
+ conf.setJobName("MergePathsH3 " + inputPath);
+
+ FileInputFormat.addInputPath(conf, new Path(inputPath));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+
+        // TODO: verify input format
+ conf.setInputFormat(SequenceFileInputFormat.class);
+
+ conf.setMapOutputKeyClass(PositionWritable.class);
+ conf.setMapOutputValueClass(MessageWritableH3.class);
+ conf.setOutputKeyClass(PositionWritable.class);
+ conf.setOutputValueClass(MessageWritableH3.class);
+
+ conf.setMapperClass(MergePathsH3Mapper.class);
+ conf.setReducerClass(MergePathsH3Reducer.class);
+
+ FileSystem.get(conf).delete(new Path(outputPath), true);
+
+ return JobClient.runJob(conf);
+ }
+
+ @Override
+ public int run(String[] arg0) throws Exception {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(new Configuration(), new MergePathsH3(), args);
+ System.exit(res);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3Driver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3Driver.java
new file mode 100644
index 0000000..3637a71
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3Driver.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+@SuppressWarnings("deprecation")
+public class MergePathsH3Driver {
+
+ private static class Options {
+ @Option(name = "-inputpath", usage = "the input path", required = true)
+ public String inputPath;
+
+ @Option(name = "-outputpath", usage = "the output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-mergeresultpath", usage = "the merging results path", required = true)
+ public String mergeResultPath;
+
+ @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+ public int numReducers;
+
+ @Option(name = "-kmer-size", usage = "the size of kmer", required = true)
+ public int sizeKmer;
+
+ @Option(name = "-merge-rounds", usage = "the while rounds of merging", required = true)
+ public int mergeRound;
+
+ }
+
+ public void run(String inputPath, String outputPath, int numReducers, int sizeKmer, int mergeRound)
+ throws IOException {
+        JobConf baseConf = new JobConf(); // TODO: should this be seeded from a default conf file rather than an empty JobConf?
+ baseConf.setNumReduceTasks(numReducers);
+ baseConf.setInt("sizeKmer", sizeKmer);
+
+ FileSystem dfs = FileSystem.get(baseConf);
+ String prevOutput = inputPath;
+ dfs.delete(new Path(outputPath), true); // clear any previous output
+
+ String tmpOutputPath = "NO_JOBS_DONE";
+        for (int iMerge = 1; iMerge <= mergeRound; iMerge++) {
+            MergePathsH3 merger = new MergePathsH3();
+            tmpOutputPath = inputPath + ".mergepathsH3." + String.valueOf(iMerge);
+            merger.run(prevOutput, tmpOutputPath, baseConf);
+            prevOutput = tmpOutputPath; // next round consumes this round's output
+        }
+ dfs.rename(new Path(tmpOutputPath), new Path(outputPath)); // save final results
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+ MergePathsH3Driver driver = new MergePathsH3Driver();
+ driver.run(options.inputPath, options.outputPath, options.numReducers,
+ options.sizeKmer, options.mergeRound);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MessageWritableH3.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MessageWritableH3.java
new file mode 100644
index 0000000..84c1049
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MessageWritableH3.java
@@ -0,0 +1,74 @@
+package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+
+import edu.uci.ics.genomix.type.NodeWritable;
+
+public class MessageWritableH3 extends BinaryComparable implements WritableComparable<BinaryComparable> {
+ private byte flag;
+ private NodeWritable node;
+
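+    // TODO: Hadoop's WritableSerialization typically instantiates value classes via a
+    // no-arg constructor; a nullary constructor may be needed before this class can be
+    // deserialized as an intermediate map output value.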
+ public MessageWritableH3(int k) {
+ this.flag = 0;
+ this.node = new NodeWritable(k);
+ }
+
+ public MessageWritableH3(byte flag, int kmerSize) {
+ this.flag = flag;
+ this.node = new NodeWritable(kmerSize);
+ }
+
+ public void set(MessageWritableH3 right) {
+ set(right.getFlag(), right.getNode());
+ }
+
+ public void set(byte flag, NodeWritable node) {
+ this.node.set(node);
+ this.flag = flag;
+ }
+
+ @Override
+ public void readFields(DataInput arg0) throws IOException {
+ node.readFields(arg0);
+ flag = arg0.readByte();
+ }
+
+ @Override
+ public void write(DataOutput arg0) throws IOException {
+ node.write(arg0);
+ arg0.writeByte(flag);
+ }
+
+ public NodeWritable getNode() {
+ if (node.getCount() != 0) {
+ return node;
+ }
+ return null;
+ }
+
+ public byte getFlag() {
+ return this.flag;
+ }
+
+ public String toString() {
+ return node.toString() + '\t' + String.valueOf(flag);
+ }
+
+ @Override
+ public byte[] getBytes() {
+ if (node.getCount() != 0) {
+ return node.getKmer().getBytes();
+ } else
+ return null;
+ }
+
+ @Override
+ public int getLength() {
+ return node.getCount();
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java
deleted file mode 100644
index 850530b..0000000
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/State.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package edu.uci.ics.genomix.hadoop.graphclean.mergepaths.h3;
-
-public class State {
-
- public static final byte EMPTY_MESSAGE = 0;
- public static final byte FROM_SELF = 1;
- public static final byte FROM_SUCCESSOR = 2;
-
- public final static class STATE_CONTENT {
-
- public static String getContentFromCode(byte code) {
- String r = "ERROR_BAD_MESSAGE";
- switch (code) {
- case EMPTY_MESSAGE:
- r = "EMPTY_MESSAGE";
- break;
- case FROM_SELF:
- r = "FROM_SELF";
- break;
- case FROM_SUCCESSOR:
- r = "FROM_SUCCESSOR";
- break;
- }
- return r;
- }
- }
-}