Add option to also generate reversed k-mers

git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@3301 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
index 51e5221..8c3f277 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
@@ -52,7 +52,7 @@
 
 				byte bitmap = tuple.getFieldData(1)[tuple.getFieldStart(1)];
 				byte count = tuple.getFieldData(2)[tuple.getFieldStart(2)];
-				reEnterCount.reset(bitmap, count);
+				reEnterCount.set(bitmap, count);
 				reEnterKey.set(kmer, keyStart, keyLength);
 				writer.append(reEnterKey, reEnterCount);
 				// @mark: this method can not used for read in hadoop 0.20.2.
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
index 5764d3f..e5b7fa9 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
@@ -25,10 +25,12 @@
 

 	private int k;

 	private int byteNum;

+	private boolean bReversed;

 

-	public ReadsKeyValueParserFactory(int k) {

+	public ReadsKeyValueParserFactory(int k, boolean bGenerateReversed) {

 		this.k = k;

 		byteNum = (byte) Math.ceil((double) k / 4.0);

+		bReversed = bGenerateReversed;

 	}

 

 	@Override

@@ -56,22 +58,41 @@
 

 			private void SplitReads(byte[] array, IFrameWriter writer) {

 				/** first kmer */

-				byte[] kmer = Kmer.CompressKmer(k, array, 0);

+				byte[] kmer = Kmer.compressKmer(k, array, 0);

 				byte pre = 0;

 				byte next = GENE_CODE.getAdjBit(array[k]);

 				InsertToFrame(kmer, pre, next, writer);

 

 				/** middle kmer */

 				for (int i = k; i < array.length - 1; i++) {

-					pre = Kmer.MoveKmer(k, kmer, array[i]);

+					pre = Kmer.moveKmer(k, kmer, array[i]);

 					next = GENE_CODE.getAdjBit(array[i + 1]);

 					InsertToFrame(kmer, pre, next, writer);

 

 				}

 				/** last kmer */

-				pre = Kmer.MoveKmer(k, kmer, array[array.length - 1]);

+				pre = Kmer.moveKmer(k, kmer, array[array.length - 1]);

 				next = 0;

 				InsertToFrame(kmer, pre, next, writer);

+

+				if (bReversed) {

+					/** first kmer */

+					kmer = Kmer.compressKmerReverse(k, array, 0);

+					next = 0;

+					pre = GENE_CODE.getAdjBit(array[k]);

+					InsertToFrame(kmer, pre, next, writer);

+					/** middle kmer */

+					for (int i = k; i < array.length - 1; i++) {

+						next = Kmer.moveKmerReverse(k, kmer, array[i]);

+						pre = GENE_CODE.getAdjBit(array[i + 1]);

+						InsertToFrame(kmer, pre, next, writer);

+					}

+					/** last kmer */

+					next = Kmer.moveKmerReverse(k, kmer,

+							array[array.length - 1]);

+					pre = 0;

+					InsertToFrame(kmer, pre, next, writer);

+				}

 			}

 

 			private void InsertToFrame(byte[] kmer, byte pre, byte next,

@@ -105,7 +126,7 @@
 			@Override

 			public void open(IFrameWriter writer) throws HyracksDataException {

 				// TODO Auto-generated method stub

-				

+

 			}

 

 			@Override

diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
index 39f181a..0751707 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
@@ -21,6 +21,8 @@
 	public static final String GROUPBY_TYPE = "genomix.graph.groupby.type";
 	/** Graph outputformat */
 	public static final String OUTPUT_FORMAT = "genomix.graph.output";
+	/** Whether to also generate reversed k-mers (boolean option) */
+	public static final String REVERSED_KMER = "genomix.kmer.reversed";
 
 	/** Configurations used by hybrid groupby function in graph build phrase */
 	public static final String GROUPBY_HYBRID_INPUTSIZE = "genomix.graph.groupby.hybrid.inputsize";
@@ -28,8 +30,8 @@
 	public static final String GROUPBY_HYBRID_RECORDSIZE_SINGLE = "genomix.graph.groupby.hybrid.recordsize.single";
 	public static final String GROUPBY_HYBRID_RECORDSIZE_CROSS = "genomix.graph.groupby.hybrid.recordsize.cross";
 	public static final String GROUPBY_HYBRID_HASHLEVEL = "genomix.graph.groupby.hybrid.hashlevel";
-	
-	public static final int DEFAULT_KMER= 55;
+
+	public static final int DEFAULT_KMER = 55;
 	public static final int DEFAULT_FRAME_SIZE = 32768;
 	public static final int DEFAULT_FRAME_LIMIT = 4096;
 	public static final int DEFAULT_TABLE_SIZE = 10485767;
@@ -38,10 +40,12 @@
 	public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_SINGLE = 9;
 	public static final int DEFAULT_GROUPBY_HYBRID_HASHLEVEL = 1;
 	public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS = 13;
-	
-	public static final String DEFAULT_GROUPBY_TYPE ="hybrid";
-	public static final String DEFAULT_OUTPUT_FORMAT ="binary";
-	
+
+	public static final boolean DEFAULT_REVERSED = false;
+
+	public static final String DEFAULT_GROUPBY_TYPE = "hybrid";
+	public static final String DEFAULT_OUTPUT_FORMAT = "binary";
+
 	public GenomixJob() throws IOException {
 		super(new Configuration());
 	}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
index b8e4219..683c0a1 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
@@ -66,6 +66,7 @@
 	private int tableSize;
 	private GroupbyType groupbyType;
 	private OutputFormat outputFormat;
+	private boolean bGenerateReversedKmer;
 
 	private AbstractOperatorDescriptor singleGrouper;
 	private IConnectorDescriptor connPartition;
@@ -207,7 +208,8 @@
 			}
 			LOG.info("HDFS read schedule " + log);
 			return new HDFSReadOperatorDescriptor(jobSpec, readOutputRec, job,
-					splits, readSchedule, new ReadsKeyValueParserFactory(kmers));
+					splits, readSchedule, new ReadsKeyValueParserFactory(kmers,
+							bGenerateReversedKmer));
 		} catch (Exception e) {
 			throw new HyracksDataException(e);
 		}
@@ -300,6 +302,9 @@
 				GenomixJob.GROUPBY_HYBRID_RECORDSIZE_CROSS,
 				GenomixJob.DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS);
 
+		bGenerateReversedKmer = conf.getBoolean(GenomixJob.REVERSED_KMER,
+				GenomixJob.DEFAULT_REVERSED);
+
 		String type = conf.get(GenomixJob.GROUPBY_TYPE,
 				GenomixJob.DEFAULT_GROUPBY_TYPE);
 		if (type.equalsIgnoreCase("external")) {
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
index 3e80ab7..f9bf5b0 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
@@ -45,6 +45,7 @@
 			+ HDFS_OUTPUT_PATH + "/merged.txt";
 	private static final String CONVERT_RESULT = DUMPED_RESULT + ".txt";
 	private static final String EXPECTED_PATH = "src/test/resources/expected/result2";
+	private static final String EXPECTED_REVERSE_PATH = "src/test/resources/expected/result_reverse";
 
 	private static final String HYRACKS_APP_NAME = "genomix";
 	private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR
@@ -125,33 +126,59 @@
 		TestPreClusterGroupby();
 		cleanUpReEntry();
 		TestHybridGroupby();
+		cleanUpReEntry();
+		conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+		TestExternalReversedGroupby();
+		cleanUpReEntry();
+		TestPreClusterReversedGroupby();
+		cleanUpReEntry();
+		TestHybridReversedGroupby();
 	}
 	
 	public void TestExternalGroupby() throws Exception {
 		conf.set(GenomixJob.GROUPBY_TYPE, "external");
-		conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
 		System.err.println("Testing ExternalGroupBy");
 		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
-		Assert.assertEquals(true, checkResults());
+		Assert.assertEquals(true, checkResults(EXPECTED_PATH));
 	}
 
 	public void TestPreClusterGroupby() throws Exception {
 		conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
-		conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
 		System.err.println("Testing PreClusterGroupBy");
 		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
-		Assert.assertEquals(true, checkResults());
+		Assert.assertEquals(true, checkResults(EXPECTED_PATH));
 	}
 
 	public void TestHybridGroupby() throws Exception {
 		conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
-		conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
 		System.err.println("Testing HybridGroupBy");
 		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
-		Assert.assertEquals(true, checkResults());
+		Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+	}
+	
+	public void TestExternalReversedGroupby() throws Exception{
+		conf.set(GenomixJob.GROUPBY_TYPE, "external");
+		conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+		System.err.println("Testing ExternalGroupBy + Reversed");
+		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+		Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+	}
+	public void TestPreClusterReversedGroupby() throws Exception{
+		conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+		conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+		System.err.println("Testing PreclusterGroupBy + Reversed");
+		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+		Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+	}
+	public void TestHybridReversedGroupby() throws Exception{
+		conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
+		conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+		System.err.println("Testing HybridGroupBy + Reversed");
+		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+		Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
 	}
 
-	private boolean checkResults() throws Exception {
+	private boolean checkResults(String expectedPath) throws Exception {
 		File dumped = null;
 		String format = conf.get(GenomixJob.OUTPUT_FORMAT);
 		if ("text".equalsIgnoreCase(format)) {
@@ -206,7 +233,7 @@
 			dumped = new File(CONVERT_RESULT);
 		}
 
-		TestUtils.compareWithSortedResult(new File(EXPECTED_PATH), dumped);
+		TestUtils.compareWithSortedResult(new File(expectedPath), dumped);
 		return true;
 	}
 
diff --git a/genomix/genomix-hyracks/src/test/resources/expected/result_reverse b/genomix/genomix-hyracks/src/test/resources/expected/result_reverse
new file mode 100644
index 0000000..cf2712d
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/expected/result_reverse
@@ -0,0 +1,8 @@
+AAGAT	G|A	5
+AATAG	|A	5
+AGAAG	T|	5
+AGATA	A|A	5
+ATAGA	A|A	5
+GAAGA	|T	5
+GATAA	A|	5
+TAGAA	A|G	5