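Clean up graph building: remove debug stubs, commented-out code and unused imports; use numReducers for the first job; parse boolean options from strings; skip reads that are not pure AGCT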
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingMapper.java
index e02b4f3..1b677c1 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingMapper.java
@@ -76,9 +76,7 @@
}
}
outputListAndKmer.set(tempPosList, kmer);
-// if(positionEntry.getReadID() == 1){
output.collect(positionEntry, outputListAndKmer);
-// }
}
}
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingReducer.java
index ec394fe..470b7fa 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/DeepGraphBuildingReducer.java
@@ -8,7 +8,6 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.lib.MultipleOutputs;
import edu.uci.ics.genomix.type.NodeWritable;
import edu.uci.ics.genomix.type.PositionListWritable;
import edu.uci.ics.genomix.type.PositionWritable;
@@ -44,14 +43,6 @@
public void reduce(PositionWritable key, Iterator<PositionListAndKmerWritable> values,
OutputCollector<NodeWritable, NullWritable> output, Reporter reporter) throws IOException {
int readID = key.getReadID();
- if(readID == 1) {
- int x = 4;
- int y =x ;
- System.out.println((int)key.getPosInRead());
- }
-/* while(values.hasNext()) {
- System.out.println(values.next().getKmer().toString());
- }*/
byte posInRead = (byte) 1;
resetNode(curNode, readID, posInRead);
assembleFirstTwoNodesInRead(curNodePosiListAndKmer, nextNodePosiListAndKmer, nextNodeNegaListAndKmer,
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphBuildingDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphBuildingDriver.java
index 1c68114..eaf2a6f 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphBuildingDriver.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphBuildingDriver.java
@@ -38,29 +38,29 @@
@Option(name = "-kmer-size", usage = "the size of kmer", required = true)
public int sizeKmer;
-
+
@Option(name = "-read-length", usage = "the length of read", required = true)
public int readLength;
@Option(name = "-onlytest1stjob", usage = "test", required = true)
- public boolean onlyTest1stJob;
+ public String onlyTest1stJob;
@Option(name = "-seq-output", usage = "sequence ouput format", required = true)
- public boolean seqOutput;
+ public String seqOutput;
}
- public void run(String inputPath, String outputPath, int numReducers, int sizeKmer, int readLength, boolean onlyTest1stJob,
- boolean seqOutput, String defaultConfPath) throws IOException {
+ public void run(String inputPath, String outputPath, int numReducers, int sizeKmer, int readLength,
+ boolean onlyTest1stJob, boolean seqOutput, String defaultConfPath) throws IOException {
if (onlyTest1stJob == true) {
runfirstjob(inputPath, numReducers, sizeKmer, readLength, seqOutput, defaultConfPath);
} else {
- runfirstjob(inputPath, 2, sizeKmer, readLength, true, defaultConfPath);
+ runfirstjob(inputPath, numReducers, sizeKmer, readLength, true, defaultConfPath);
runsecondjob(inputPath, outputPath, numReducers, sizeKmer, readLength, seqOutput, defaultConfPath);
}
}
- public void runfirstjob(String inputPath, int numReducers, int sizeKmer, int readLength, boolean seqOutput, String defaultConfPath)
- throws IOException {
+ public void runfirstjob(String inputPath, int numReducers, int sizeKmer, int readLength, boolean seqOutput,
+ String defaultConfPath) throws IOException {
JobConf conf = new JobConf(GraphBuildingDriver.class);
conf.setInt("sizeKmer", sizeKmer);
if (defaultConfPath != null) {
@@ -85,16 +85,18 @@
FileInputFormat.setInputPaths(conf, new Path(inputPath));
FileOutputFormat.setOutputPath(conf, new Path(inputPath + "-step1"));
- if(numReducers == 0)
- conf.setNumReduceTasks(numReducers + 2);
+ if (numReducers == 0)
+ conf.setNumReduceTasks(numReducers + 2);
+ else
+ conf.setNumReduceTasks(numReducers);
FileSystem dfs = FileSystem.get(conf);
dfs.delete(new Path(inputPath + "-step1"), true);
JobClient.runJob(conf);
}
- public void runsecondjob(String inputPath, String outputPath, int numReducers, int sizeKmer, int readLength, boolean seqOutput,
- String defaultConfPath) throws IOException {
+ public void runsecondjob(String inputPath, String outputPath, int numReducers, int sizeKmer, int readLength,
+ boolean seqOutput, String defaultConfPath) throws IOException {
JobConf conf = new JobConf(GraphBuildingDriver.class);
if (defaultConfPath != null) {
conf.addResource(new Path(defaultConfPath));
@@ -102,7 +104,7 @@
conf.setJobName("deep build");
conf.setInt("sizeKmer", sizeKmer);
conf.setInt("readLength", readLength);
-
+
conf.setMapperClass(DeepGraphBuildingMapper.class);
conf.setReducerClass(DeepGraphBuildingReducer.class);
@@ -113,13 +115,13 @@
conf.setOutputKeyComparatorClass(PositionWritable.Comparator.class);
conf.setOutputValueGroupingComparator(PositionWritable.FirstComparator.class);
-
+
conf.setInputFormat(SequenceFileInputFormat.class);
if (seqOutput == true)
conf.setOutputFormat(SequenceFileOutputFormat.class);
else
conf.setOutputFormat(TextOutputFormat.class);
-
+
if (numReducers != 0) {
conf.setOutputKeyClass(NodeWritable.class);
conf.setOutputValueClass(NullWritable.class);
@@ -141,7 +143,17 @@
CmdLineParser parser = new CmdLineParser(options);
parser.parseArgument(args);
GraphBuildingDriver driver = new GraphBuildingDriver();
+ boolean onlyTest1stJob = true;
+ boolean seqOutput = true;
+ if (options.onlyTest1stJob.equals("true"))
+ onlyTest1stJob = true;
+ else
+ onlyTest1stJob = false;
+ if (options.seqOutput.equals("true"))
+ seqOutput = true;
+ else
+ seqOutput = false;
driver.run(options.inputPath, options.outputPath, options.numReducers, options.sizeKmer, options.readLength,
- options.onlyTest1stJob, options.seqOutput, null);
+ onlyTest1stJob, seqOutput, null);
}
}
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingMapper.java
index a44dcd2..4eb7e28 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingMapper.java
@@ -1,6 +1,9 @@
package edu.uci.ics.genomix.hadoop.velvetgraphbuilding;
import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
@@ -37,33 +40,34 @@
}
int readID = 0;
readID = Integer.parseInt(rawLine[0]);
- if(readID == 6) {
- int x = 4;
- int y = x;
- }
String geneLine = rawLine[1];
- byte[] array = geneLine.getBytes();
- if (KMER_SIZE >= array.length) {
- throw new IOException("short read");
- }
- outputKmer.setByRead(array, 0);
- outputVertexID.set(readID, (byte) 1);
- output.collect(outputKmer, outputVertexID);
- /** middle kmer */
- for (int i = KMER_SIZE; i < array.length; i++) {
- outputKmer.shiftKmerWithNextChar(array[i]);
- outputVertexID.set(readID, (byte) (i - KMER_SIZE + 2));
+ Pattern genePattern = Pattern.compile("[AGCT]+");
+ Matcher geneMatcher = genePattern.matcher(geneLine);
+ boolean isValid = geneMatcher.matches();
+ if (isValid == true) {
+ byte[] array = geneLine.getBytes();
+ if (KMER_SIZE >= array.length) {
+ throw new IOException("short read");
+ }
+ outputKmer.setByRead(array, 0);
+ outputVertexID.set(readID, (byte) 1);
output.collect(outputKmer, outputVertexID);
- }
- /** reverse first kmer */
- outputKmer.setByReadReverse(array, 0);
- outputVertexID.set(readID, (byte) -1);
- output.collect(outputKmer, outputVertexID);
- /** reverse middle kmer */
- for (int i = KMER_SIZE; i < array.length; i++) {
- outputKmer.shiftKmerWithPreCode(GeneCode.getPairedCodeFromSymbol(array[i]));
- outputVertexID.set(readID, (byte)(KMER_SIZE - i - 2));
+ /** middle kmer */
+ for (int i = KMER_SIZE; i < array.length; i++) {
+ outputKmer.shiftKmerWithNextChar(array[i]);
+ outputVertexID.set(readID, (byte) (i - KMER_SIZE + 2));
+ output.collect(outputKmer, outputVertexID);
+ }
+ /** reverse first kmer */
+ outputKmer.setByReadReverse(array, 0);
+ outputVertexID.set(readID, (byte) -1);
output.collect(outputKmer, outputVertexID);
+ /** reverse middle kmer */
+ for (int i = KMER_SIZE; i < array.length; i++) {
+ outputKmer.shiftKmerWithPreCode(GeneCode.getPairedCodeFromSymbol(array[i]));
+ outputVertexID.set(readID, (byte) (KMER_SIZE - i - 2));
+ output.collect(outputKmer, outputVertexID);
+ }
}
}
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingReducer.java
index beba5ad..d2b6476 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/GraphInvertedIndexBuildingReducer.java
@@ -19,10 +19,6 @@
public void reduce(KmerBytesWritable key, Iterator<PositionWritable> values,
OutputCollector<KmerBytesWritable, PositionListWritable> output, Reporter reporter) throws IOException {
outputlist.reset();
- if(key.toString().equals("CTTCT")) {
- int x = 4;
- int y = x;
- }
while (values.hasNext()) {
outputlist.append(values.next());
}
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/NewGraphBuildingTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/NewGraphBuildingTest.java
index 3e9a098..f6236d2 100644
--- a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/NewGraphBuildingTest.java
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/NewGraphBuildingTest.java
@@ -15,7 +15,6 @@
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
-import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;
@SuppressWarnings("deprecation")
@@ -32,8 +31,6 @@
private static final int COUNT_REDUCER = 2;
private static final int SIZE_KMER = 5;
private static final int READ_LENGTH = 8;
- private static final String GRAPHVIZ = "Graphviz";
- private static final String EXPECTED_OUPUT_KMER = EXPECTED_PATH + "result_after_kmerAggregate";
private MiniDFSCluster dfsCluster;
private MiniMRCluster mrCluster;
@@ -45,57 +42,15 @@
FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
startHadoop();
-// TestGroupbyKmer();
-// TestMapKmerToRead();
- TestGroupByReadID();
-/* SequenceFile.Reader reader = null;
- Path path = new Path(RESULT_PATH + "/part-00000");
- reader = new SequenceFile.Reader(dfs, path, conf);
- KmerBytesWritable key = new KmerBytesWritable(SIZE_KMER);
- KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
- File filePathTo = new File(TEST_SOURCE_DIR);
- FileUtils.forceMkdir(filePathTo);
- FileUtils.cleanDirectory(filePathTo);
- BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TEST_SOURCE_DIR + "/comparesource.txt")));
- File GraphViz = new File(GRAPHVIZ);
- FileUtils.forceMkdir(GraphViz);
- FileUtils.cleanDirectory(GraphViz);
- BufferedWriter bw2 = new BufferedWriter(new FileWriter(new File(GRAPHVIZ + "/GenomixSource.txt")));
-
- while (reader.next(key, value)) {
- byte succeed = (byte) 0x0F;
- byte adjBitMap = value.getAdjBitMap();
- succeed = (byte) (succeed & adjBitMap);
- byte shiftedCode = 0;
- for(int i = 0 ; i < 4; i ++){
- byte temp = 0x01;
- temp = (byte)(temp << i);
- temp = (byte) (succeed & temp);
- if(temp != 0 ){
- bw2.write(key.toString());
- bw2.newLine();
- byte succeedCode = GeneCode.getGeneCodeFromBitMap(temp);
- shiftedCode = key.shiftKmerWithNextCode(succeedCode);
- bw2.write(key.toString());
- bw2.newLine();
- key.shiftKmerWithPreCode(shiftedCode);
- }
- }
- bw.write(key.toString() + "\t" + value.toString());
- bw.newLine();
- }
- bw2.close();
- bw.close();*/
-
+ TestGroupbyKmer();
+ TestMapKmerToRead();
cleanupHadoop();
-
}
public void TestGroupbyKmer() throws Exception {
GraphBuildingDriver tldriver = new GraphBuildingDriver();
tldriver.run(HDFS_PATH, RESULT_PATH, COUNT_REDUCER, SIZE_KMER, READ_LENGTH, true, false, HADOOP_CONF_PATH);
dumpGroupByKmerResult();
-// TestUtils.compareWithResult(new File(ACTUAL_RESULT_DIR + HDFS_PATH + "-step1" + "/part-00000"), new File(EXPECTED_OUPUT_KMER));
}
public void TestMapKmerToRead() throws Exception {