Modify the velvet graph-building driver: split graph construction into separately runnable first and second jobs, add -onlytest1stjob and -seq-output command-line options, and parse read IDs from tab-separated input lines.
diff --git a/genomix/genomix-hadoop/data/webmap/Test.txt b/genomix/genomix-hadoop/data/webmap/Test.txt
deleted file mode 100755
index 6d02b25..0000000
--- a/genomix/genomix-hadoop/data/webmap/Test.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-@625E1AAXX100810:1:100:10000:10271/1
-AGCATCGCA
-+
-EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
-@625E1AAXX100810:1:100:10000:10271/1
-TGCATCGCT
-+
-EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
-
-
diff --git a/genomix/genomix-hadoop/data/webmap/text.txt b/genomix/genomix-hadoop/data/webmap/text.txt
new file mode 100755
index 0000000..13190dd
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/text.txt
@@ -0,0 +1,6 @@
+1 AATAGAAG
+2 AATAGAAG
+3 AATAGAAG
+4 AATAGAAG
+5 AATAGAAG
+6 AGAAGAAG
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingMapper.java
index c3c252a..7f9e995 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingMapper.java
@@ -1,7 +1,6 @@
package edu.uci.ics.genomix.hadoop.velvetgraphbuilding;
import java.io.IOException;
-import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphBuildingDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphBuildingDriver.java
index acbc3f1..68ce1c2 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphBuildingDriver.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphBuildingDriver.java
@@ -39,11 +39,25 @@
@Option(name = "-kmer-size", usage = "the size of kmer", required = true)
public int sizeKmer;
- }
-
- public void run(String inputPath, String outputPath, int numReducers, int sizeKmer, String defaultConfPath)
- throws IOException {
+ @Option(name = "-onlytest1stjob", usage = "run only the first graph-building job", required = true)
+ public boolean onlyTest1stJob;
+ @Option(name = "-seq-output", usage = "sequence output format", required = true)
+ public boolean seqOutput;
+ }
+
+ public void run(String inputPath, String outputPath, int numReducers, int sizeKmer, boolean onlyTest1stJob,
+ boolean seqOutput, String defaultConfPath) throws IOException {
+ if (onlyTest1stJob == true) {
+ runfirstjob(inputPath, numReducers, sizeKmer, seqOutput, defaultConfPath);
+ } else {
+ runfirstjob(inputPath, numReducers, sizeKmer, false, defaultConfPath);
+ runsecondjob(inputPath, outputPath, numReducers, sizeKmer, seqOutput, defaultConfPath);
+ }
+ }
+
+ public void runfirstjob(String inputPath, int numReducers, int sizeKmer, boolean seqOutput, String defaultConfPath)
+ throws IOException {
JobConf conf = new JobConf(GraphBuildingDriver.class);
conf.setInt("sizeKmer", sizeKmer);
if (defaultConfPath != null) {
@@ -58,11 +72,14 @@
conf.setMapOutputValueClass(PositionWritable.class);
conf.setInputFormat(TextInputFormat.class);
- conf.setOutputFormat(SequenceFileOutputFormat.class);
-
+ if (seqOutput == true)
+ conf.setOutputFormat(SequenceFileOutputFormat.class);
+ else
+ conf.setOutputFormat(TextOutputFormat.class);
+
conf.setOutputKeyClass(KmerBytesWritable.class);
conf.setOutputValueClass(PositionListWritable.class);
-
+
FileInputFormat.setInputPaths(conf, new Path(inputPath));
FileOutputFormat.setOutputPath(conf, new Path(inputPath + "-step1"));
conf.setNumReduceTasks(numReducers);
@@ -70,34 +87,40 @@
FileSystem dfs = FileSystem.get(conf);
dfs.delete(new Path(inputPath + "-step1"), true);
JobClient.runJob(conf);
-
- //-------------
- conf = new JobConf(GraphBuildingDriver.class);
+ }
+
+ public void runsecondjob(String inputPath, String outputPath, int numReducers, int sizeKmer, boolean seqOutput,
+ String defaultConfPath) throws IOException {
+ JobConf conf = new JobConf(GraphBuildingDriver.class);
if (defaultConfPath != null) {
conf.addResource(new Path(defaultConfPath));
}
conf.setJobName("deep build");
-
+
conf.setMapperClass(DeepGraphBuildingMapper.class);
conf.setReducerClass(DeepGraphBuildingReducer.class);
-
+
conf.setMapOutputKeyClass(PositionWritable.class);
conf.setMapOutputValueClass(PositionListAndKmerWritable.class);
-
+
conf.setPartitionerClass(ReadIDPartitioner.class);
-
+
conf.setOutputKeyComparatorClass(PositionWritable.Comparator.class);
conf.setOutputValueGroupingComparator(PositionWritable.FirstComparator.class);
-
+
conf.setInputFormat(SequenceFileInputFormat.class);
- conf.setOutputFormat(TextOutputFormat.class);
-
+ if (seqOutput == true)
+ conf.setOutputFormat(SequenceFileOutputFormat.class);
+ else
+ conf.setOutputFormat(TextOutputFormat.class);
+
conf.setOutputKeyClass(NodeWritable.class);
conf.setOutputValueClass(NullWritable.class);
-
+
FileInputFormat.setInputPaths(conf, new Path(inputPath + "-step1"));
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
conf.setNumReduceTasks(1);
+ FileSystem dfs = FileSystem.get(conf);
dfs.delete(new Path(outputPath), true);
JobClient.runJob(conf);
}
@@ -107,6 +130,7 @@
CmdLineParser parser = new CmdLineParser(options);
parser.parseArgument(args);
GraphBuildingDriver driver = new GraphBuildingDriver();
- driver.run(options.inputPath, options.outputPath, options.numReducers, options.sizeKmer, null);
+ driver.run(options.inputPath, options.outputPath, options.numReducers, options.sizeKmer,
+ options.onlyTest1stJob, options.seqOutput, null);
}
}
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingMapper.java
index d5924c8..9592a59 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingMapper.java
@@ -15,7 +15,7 @@
@SuppressWarnings("deprecation")
public class GraphInvertedIndexBuildingMapper extends MapReduceBase implements
Mapper<LongWritable, Text, KmerBytesWritable, PositionWritable> {
-
+
public static int KMER_SIZE;
public PositionWritable outputVertexID;
public KmerBytesWritable outputKmer;
@@ -26,22 +26,26 @@
outputVertexID = new PositionWritable();
outputKmer = new KmerBytesWritable(KMER_SIZE);
}
+
@Override
public void map(LongWritable key, Text value, OutputCollector<KmerBytesWritable, PositionWritable> output,
Reporter reporter) throws IOException {
- String geneLine = value.toString();
/** first kmer */
+ String[] rawLine = value.toString().split("\\t"); // Read the Real Gene Line
+ if (rawLine.length != 2) {
+ throw new IOException("invalid data");
+ }
+ int readID = 0;
+ readID = Integer.parseInt(rawLine[0]);
+ String geneLine = rawLine[1];
byte[] array = geneLine.getBytes();
outputKmer.setByRead(array, 0);
- System.out.println(key.get());
- outputVertexID.set((int)key.get(), (byte)0);
+ outputVertexID.set(readID, (byte) 0);
output.collect(outputKmer, outputVertexID);
/** middle kmer */
- int i = 0;
- for (i = KMER_SIZE; i < array.length; i++) {
+ for (int i = KMER_SIZE; i < array.length; i++) {
outputKmer.shiftKmerWithNextChar(array[i]);
- System.out.println((int)key.get());
- outputVertexID.set((int)key.get(), (byte)(i - KMER_SIZE + 1));
+ outputVertexID.set(readID, (byte) (i - KMER_SIZE + 1));
output.collect(outputKmer, outputVertexID);
}
}
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/NewGraphBuildingTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/NewGraphBuildingTest.java
new file mode 100644
index 0000000..18f95b3
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/NewGraphBuildingTest.java
@@ -0,0 +1,136 @@
+package edu.uci.ics.genomix.hadoop.velvetgraphbuilding;
+
+import java.io.BufferedWriter;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import junit.framework.Assert;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MiniMRCluster;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.junit.Test;
+@SuppressWarnings("deprecation")
+
+public class NewGraphBuildingTest {
+
+ private JobConf conf = new JobConf();
+ private static final String ACTUAL_RESULT_DIR = "actual1";
+ private static final String COMPARE_DIR = "compare";
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
+ private static final String DATA_PATH = "data/webmap/Test.txt";
+ private static final String HDFS_PATH = "/webmap";
+ private static final String RESULT_PATH = "/result1";
+ private static final String EXPECTED_PATH = "expected/result1";
+ private static final String TEST_SOURCE_DIR = COMPARE_DIR + RESULT_PATH;
+ private static final int COUNT_REDUCER = 4;
+ private static final int SIZE_KMER = 3;
+ private static final String GRAPHVIZ = "Graphviz";
+
+ private MiniDFSCluster dfsCluster;
+ private MiniMRCluster mrCluster;
+ private FileSystem dfs;
+
+ @SuppressWarnings("resource")
+ @Test
+ public void test() throws Exception {
+ startHadoop();
+ TestGroupbyKmer();
+
+/* SequenceFile.Reader reader = null;
+ Path path = new Path(RESULT_PATH + "/part-00000");
+ reader = new SequenceFile.Reader(dfs, path, conf);
+ KmerBytesWritable key = new KmerBytesWritable(SIZE_KMER);
+ KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+ File filePathTo = new File(TEST_SOURCE_DIR);
+ FileUtils.forceMkdir(filePathTo);
+ FileUtils.cleanDirectory(filePathTo);
+ BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TEST_SOURCE_DIR + "/comparesource.txt")));
+ File GraphViz = new File(GRAPHVIZ);
+ FileUtils.forceMkdir(GraphViz);
+ FileUtils.cleanDirectory(GraphViz);
+ BufferedWriter bw2 = new BufferedWriter(new FileWriter(new File(GRAPHVIZ + "/GenomixSource.txt")));
+
+ while (reader.next(key, value)) {
+ byte succeed = (byte) 0x0F;
+ byte adjBitMap = value.getAdjBitMap();
+ succeed = (byte) (succeed & adjBitMap);
+ byte shiftedCode = 0;
+ for(int i = 0 ; i < 4; i ++){
+ byte temp = 0x01;
+ temp = (byte)(temp << i);
+ temp = (byte) (succeed & temp);
+ if(temp != 0 ){
+ bw2.write(key.toString());
+ bw2.newLine();
+ byte succeedCode = GeneCode.getGeneCodeFromBitMap(temp);
+ shiftedCode = key.shiftKmerWithNextCode(succeedCode);
+ bw2.write(key.toString());
+ bw2.newLine();
+ key.shiftKmerWithPreCode(shiftedCode);
+ }
+ }
+ bw.write(key.toString() + "\t" + value.toString());
+ bw.newLine();
+ }
+ bw2.close();
+ bw.close();*/
+
+
+// TestUtils.compareWithResult(new File(TEST_SOURCE_DIR + "/comparesource.txt"), new File(EXPECTED_PATH));
+ cleanupHadoop();
+
+ }
+
+ public void TestGroupbyKmer() throws Exception {
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ GraphBuildingDriver tldriver = new GraphBuildingDriver();
+ tldriver.run(HDFS_PATH, RESULT_PATH, COUNT_REDUCER, SIZE_KMER, true, false, HADOOP_CONF_PATH);
+ dumpResult();
+ }
+
+ public void TestMapKmerToRead() throws Exception {
+
+ }
+
+ public void TestGroupByReadID() throws Exception {
+
+ }
+ private void startHadoop() throws IOException {
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, 2, true, null);
+ dfs = dfsCluster.getFileSystem();
+ mrCluster = new MiniMRCluster(4, dfs.getUri().toString(), 2);
+
+ Path src = new Path(DATA_PATH);
+ Path dest = new Path(HDFS_PATH + "/");
+ dfs.mkdirs(dest);
+ dfs.copyFromLocalFile(src, dest);
+
+ DataOutputStream confOutput = new DataOutputStream(new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
+
+ private void cleanupHadoop() throws IOException {
+ mrCluster.shutdown();
+ dfsCluster.shutdown();
+ }
+
+ private void dumpResult() throws IOException {
+ Path src = new Path(RESULT_PATH);
+ Path dest = new Path(ACTUAL_RESULT_DIR);
+ dfs.copyToLocalFile(src, dest);
+ }
+}