Add an option to also generate reversed kmers when splitting reads
git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@3301 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
index 51e5221..8c3f277 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
@@ -52,7 +52,7 @@
byte bitmap = tuple.getFieldData(1)[tuple.getFieldStart(1)];
byte count = tuple.getFieldData(2)[tuple.getFieldStart(2)];
- reEnterCount.reset(bitmap, count);
+ reEnterCount.set(bitmap, count);
reEnterKey.set(kmer, keyStart, keyLength);
writer.append(reEnterKey, reEnterCount);
// @mark: this method can not used for read in hadoop 0.20.2.
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
index 5764d3f..e5b7fa9 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
@@ -25,10 +25,12 @@
private int k;
private int byteNum;
+ private boolean bReversed;
- public ReadsKeyValueParserFactory(int k) {
+ public ReadsKeyValueParserFactory(int k, boolean bGenerateReversed) {
this.k = k;
byteNum = (byte) Math.ceil((double) k / 4.0);
+ bReversed = bGenerateReversed;
}
@Override
@@ -56,22 +58,41 @@
private void SplitReads(byte[] array, IFrameWriter writer) {
/** first kmer */
- byte[] kmer = Kmer.CompressKmer(k, array, 0);
+ byte[] kmer = Kmer.compressKmer(k, array, 0);
byte pre = 0;
byte next = GENE_CODE.getAdjBit(array[k]);
InsertToFrame(kmer, pre, next, writer);
/** middle kmer */
for (int i = k; i < array.length - 1; i++) {
- pre = Kmer.MoveKmer(k, kmer, array[i]);
+ pre = Kmer.moveKmer(k, kmer, array[i]);
next = GENE_CODE.getAdjBit(array[i + 1]);
InsertToFrame(kmer, pre, next, writer);
}
/** last kmer */
- pre = Kmer.MoveKmer(k, kmer, array[array.length - 1]);
+ pre = Kmer.moveKmer(k, kmer, array[array.length - 1]);
next = 0;
InsertToFrame(kmer, pre, next, writer);
+
+ if (bReversed) {
+ /** first kmer */
+ kmer = Kmer.compressKmerReverse(k, array, 0);
+ next = 0;
+ pre = GENE_CODE.getAdjBit(array[k]);
+ InsertToFrame(kmer, pre, next, writer);
+ /** middle kmer */
+ for (int i = k; i < array.length - 1; i++) {
+ next = Kmer.moveKmerReverse(k, kmer, array[i]);
+ pre = GENE_CODE.getAdjBit(array[i + 1]);
+ InsertToFrame(kmer, pre, next, writer);
+ }
+ /** last kmer */
+ next = Kmer.moveKmerReverse(k, kmer,
+ array[array.length - 1]);
+ pre = 0;
+ InsertToFrame(kmer, pre, next, writer);
+ }
}
private void InsertToFrame(byte[] kmer, byte pre, byte next,
@@ -105,7 +126,7 @@
@Override
public void open(IFrameWriter writer) throws HyracksDataException {
// TODO Auto-generated method stub
-
+
}
@Override
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
index 39f181a..0751707 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
@@ -21,6 +21,8 @@
public static final String GROUPBY_TYPE = "genomix.graph.groupby.type";
/** Graph outputformat */
public static final String OUTPUT_FORMAT = "genomix.graph.output";
+ /** Boolean option: whether to also generate the reversed kmer sequence */
+ public static final String REVERSED_KMER = "genomix.kmer.reversed";
/** Configurations used by hybrid groupby function in graph build phrase */
public static final String GROUPBY_HYBRID_INPUTSIZE = "genomix.graph.groupby.hybrid.inputsize";
@@ -28,8 +30,8 @@
public static final String GROUPBY_HYBRID_RECORDSIZE_SINGLE = "genomix.graph.groupby.hybrid.recordsize.single";
public static final String GROUPBY_HYBRID_RECORDSIZE_CROSS = "genomix.graph.groupby.hybrid.recordsize.cross";
public static final String GROUPBY_HYBRID_HASHLEVEL = "genomix.graph.groupby.hybrid.hashlevel";
-
- public static final int DEFAULT_KMER= 55;
+
+ public static final int DEFAULT_KMER = 55;
public static final int DEFAULT_FRAME_SIZE = 32768;
public static final int DEFAULT_FRAME_LIMIT = 4096;
public static final int DEFAULT_TABLE_SIZE = 10485767;
@@ -38,10 +40,12 @@
public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_SINGLE = 9;
public static final int DEFAULT_GROUPBY_HYBRID_HASHLEVEL = 1;
public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS = 13;
-
- public static final String DEFAULT_GROUPBY_TYPE ="hybrid";
- public static final String DEFAULT_OUTPUT_FORMAT ="binary";
-
+
+ public static final boolean DEFAULT_REVERSED = false;
+
+ public static final String DEFAULT_GROUPBY_TYPE = "hybrid";
+ public static final String DEFAULT_OUTPUT_FORMAT = "binary";
+
public GenomixJob() throws IOException {
super(new Configuration());
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
index b8e4219..683c0a1 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
@@ -66,6 +66,7 @@
private int tableSize;
private GroupbyType groupbyType;
private OutputFormat outputFormat;
+ private boolean bGenerateReversedKmer;
private AbstractOperatorDescriptor singleGrouper;
private IConnectorDescriptor connPartition;
@@ -207,7 +208,8 @@
}
LOG.info("HDFS read schedule " + log);
return new HDFSReadOperatorDescriptor(jobSpec, readOutputRec, job,
- splits, readSchedule, new ReadsKeyValueParserFactory(kmers));
+ splits, readSchedule, new ReadsKeyValueParserFactory(kmers,
+ bGenerateReversedKmer));
} catch (Exception e) {
throw new HyracksDataException(e);
}
@@ -300,6 +302,9 @@
GenomixJob.GROUPBY_HYBRID_RECORDSIZE_CROSS,
GenomixJob.DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS);
+ bGenerateReversedKmer = conf.getBoolean(GenomixJob.REVERSED_KMER,
+ GenomixJob.DEFAULT_REVERSED);
+
String type = conf.get(GenomixJob.GROUPBY_TYPE,
GenomixJob.DEFAULT_GROUPBY_TYPE);
if (type.equalsIgnoreCase("external")) {
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
index 3e80ab7..f9bf5b0 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
@@ -45,6 +45,7 @@
+ HDFS_OUTPUT_PATH + "/merged.txt";
private static final String CONVERT_RESULT = DUMPED_RESULT + ".txt";
private static final String EXPECTED_PATH = "src/test/resources/expected/result2";
+ private static final String EXPECTED_REVERSE_PATH = "src/test/resources/expected/result_reverse";
private static final String HYRACKS_APP_NAME = "genomix";
private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR
@@ -125,33 +126,59 @@
TestPreClusterGroupby();
cleanUpReEntry();
TestHybridGroupby();
+ cleanUpReEntry();
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ TestExternalReversedGroupby();
+ cleanUpReEntry();
+ TestPreClusterReversedGroupby();
+ cleanUpReEntry();
+ TestHybridReversedGroupby();
}
public void TestExternalGroupby() throws Exception {
conf.set(GenomixJob.GROUPBY_TYPE, "external");
- conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
System.err.println("Testing ExternalGroupBy");
driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults());
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
}
public void TestPreClusterGroupby() throws Exception {
conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
- conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
System.err.println("Testing PreClusterGroupBy");
driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults());
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
}
public void TestHybridGroupby() throws Exception {
conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
- conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
System.err.println("Testing HybridGroupBy");
driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults());
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+ }
+
+ public void TestExternalReversedGroupby() throws Exception{
+ conf.set(GenomixJob.GROUPBY_TYPE, "external");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing ExternalGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
+ public void TestPreClusterReversedGroupby() throws Exception{
+ conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing PreclusterGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
+ public void TestHybridReversedGroupby() throws Exception{
+ conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing HybridGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
}
- private boolean checkResults() throws Exception {
+ private boolean checkResults(String expectedPath) throws Exception {
File dumped = null;
String format = conf.get(GenomixJob.OUTPUT_FORMAT);
if ("text".equalsIgnoreCase(format)) {
@@ -206,7 +233,7 @@
dumped = new File(CONVERT_RESULT);
}
- TestUtils.compareWithSortedResult(new File(EXPECTED_PATH), dumped);
+ TestUtils.compareWithSortedResult(new File(expectedPath), dumped);
return true;
}
diff --git a/genomix/genomix-hyracks/src/test/resources/expected/result_reverse b/genomix/genomix-hyracks/src/test/resources/expected/result_reverse
new file mode 100644
index 0000000..cf2712d
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/expected/result_reverse
@@ -0,0 +1,8 @@
+AAGAT G|A 5
+AATAG |A 5
+AGAAG T| 5
+AGATA A|A 5
+ATAGA A|A 5
+GAAGA |T 5
+GATAA A| 5
+TAGAA A|G 5