Make the text output test pass

git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@2961 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/FileScanDescriptor.java b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/FileScanDescriptor.java
index 3a53ef2..d8f0122 100644
--- a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/FileScanDescriptor.java
+++ b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/FileScanDescriptor.java
@@ -150,7 +150,7 @@
 				try {

 					byte adj = GENE_CODE.mergePreNextAdj(pre, next);

 					tupleBuilder.reset();

-					tupleBuilder.addField(kmer, 0, byteNum + 1);

+					tupleBuilder.addField(kmer, 0, byteNum);

 					tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE,

 							adj);

 

diff --git a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
index 441b798..332b7d7 100644
--- a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
+++ b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
@@ -26,16 +26,16 @@
 		public void write(DataOutput output, ITupleReference tuple)
 				throws HyracksDataException {
 			try {
-				output.writeChars(Kmer.recoverKmerFrom(KMER,
+				output.write(Kmer.recoverKmerFrom(KMER,
 						tuple.getFieldData(0), tuple.getFieldStart(0),
-						tuple.getFieldLength(0)));
-				output.writeChar('\t');
-				output.writeChars(Kmer.recoverAdjacent(tuple
-						.getFieldData(1)[tuple.getFieldStart(1)]));
-				output.writeChar('\t');
-				output.writeInt((int)tuple
-						.getFieldData(2)[tuple.getFieldStart(2)]);
-				output.writeChar('\n');
+						tuple.getFieldLength(0)).getBytes());
+				output.writeByte('\t');
+				output.write(Kmer.GENE_CODE.getSymbolFromBitMap(tuple
+						.getFieldData(1)[tuple.getFieldStart(1)]).getBytes());
+				output.writeByte('\t');
+				output.write(String.valueOf((int)tuple
+						.getFieldData(2)[tuple.getFieldStart(2)]).getBytes());
+				output.writeByte('\n');
 			} catch (IOException e) {
 				throw new HyracksDataException(e);
 			}
diff --git a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
index 1b13760..bc65204 100644
--- a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
@@ -86,7 +86,7 @@
 				try {

 					byte adj = GENE_CODE.mergePreNextAdj(pre, next);

 					tupleBuilder.reset();

-					tupleBuilder.addField(kmer, 0, byteNum + 1);

+					tupleBuilder.addField(kmer, 0, byteNum);

 					tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE,

 							adj);

 

diff --git a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/type/Kmer.java b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/type/Kmer.java
index 19d754d..8517432 100644
--- a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/type/Kmer.java
+++ b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/type/Kmer.java
@@ -1,6 +1,5 @@
 package edu.uci.ics.genomix.type;
 
-
 public class Kmer {
 
 	public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
@@ -8,7 +7,8 @@
 	public final static class GENE_CODE {
 
 		/**
-		 * make sure this 4 ids equal to the sequence id of char in {@GENE_SYMBOL}
+		 * Make sure these 4 ids equal the index of the corresponding char in
+		 * {@link Kmer#GENE_SYMBOL}.
 		 */
 		public static final byte A = 0;
 		public static final byte C = 1;
@@ -67,24 +67,24 @@
 			}
 			return r;
 		}
-		
-		public static byte mergePreNextAdj(byte pre, byte next){
+
+		public static byte mergePreNextAdj(byte pre, byte next) {
 			return (byte) (pre << 4 | next & 0x0f);
 		}
-		
+
 		public static String getSymbolFromBitMap(byte code) {
 			int left = (code >> 4) & 0x0F;
 			int right = code & 0x0F;
 			String str = new String();
-			for(int i = A; i <= T ; i++){
-				if ( (left & (1<<i)) != 0){
-					str += GENE_SYMBOL[i];
+			for (int i = A; i <= T; i++) {
+				if ((left & (1 << i)) != 0) {
+					str += (char)GENE_SYMBOL[i];
 				}
 			}
 			str += '|';
-			for(int i = A; i <= T ; i++){
-				if ( (right & (1<<i)) != 0){
-					str += GENE_SYMBOL[i];
+			for (int i = A; i <= T; i++) {
+				if ((right & (1 << i)) != 0) {
+					str += (char)GENE_SYMBOL[i];
 				}
 			}
 			return str;
@@ -94,94 +94,106 @@
 	public static String recoverKmerFrom(int k, byte[] keyData, int keyStart,
 			int keyLength) {
 		String strKmer = new String();
-		int byteId = keyStart + keyLength-1;
+		int byteId = keyStart + keyLength - 1;
 		byte currentbyte = keyData[byteId];
-		for(int geneCount = 0; geneCount < k ; geneCount++){
-			if (geneCount % 4 == 0 && geneCount > 0){
+		for (int geneCount = 0; geneCount < k; geneCount++) {
+			if (geneCount % 4 == 0 && geneCount > 0) {
 				currentbyte = keyData[--byteId];
 			}
-			strKmer += (char)GENE_SYMBOL[(currentbyte >> ((geneCount%4)*2)) & 0x03];
+			strKmer += (char) GENE_SYMBOL[(currentbyte >> ((geneCount % 4) * 2)) & 0x03];
 		}
 		return strKmer;
 	}
 
-	public static String recoverAdjacent(byte number) {
-		int incoming = (number & 0xF0) >> 4;
-		int outgoing = number & 0x0F;
-		return String.valueOf(incoming) + '|' + String.valueOf(outgoing);
-	}
-
 	/**
-	 * Compress Kmer into bytes array
-	 * AATAG will compress as [0 0 0 G][A T A A]
-	 * @param kmer 
-	 * @param input array
-	 * @param start position
+	 * Compress a Kmer into a byte array, e.g. AATAG is compressed as [0 0 0 G][A T A A].
+	 * 
+	 * @param k kmer length (number of gene symbols)
+	 * @param array
+	 *            input array
+	 * @param start
+	 *            start position
 	 * @return initialed kmer array
 	 */
 	public static byte[] CompressKmer(int k, byte[] array, int start) {
 		final int byteNum = (byte) Math.ceil((double) k / 4.0);
-		byte[] bytes = new byte[byteNum ];
+		byte[] bytes = new byte[byteNum];
 
 		byte l = 0;
 		int bytecount = 0;
-		int bcount = byteNum-1;
-		for (int i = start; i<start +k; i++){
+		int bcount = byteNum - 1;
+		for (int i = start; i < start + k; i++) {
 			byte code = GENE_CODE.getCodeFromSymbol(array[i]);
 			l |= (byte) (code << bytecount);
-			bytecount +=2;
-			if (bytecount == 8){
+			bytecount += 2;
+			if (bytecount == 8) {
 				bytes[bcount--] = l;
 				l = 0;
-				bytecount= 0;
+				bytecount = 0;
 			}
 		}
-		if (bcount >= 0){
-			bytes[0]=l;
+		if (bcount >= 0) {
+			bytes[0] = l;
 		}
 		return bytes;
 	}
 
 	/**
 	 * Shift Kmer to accept new input
+	 * 
 	 * @param kmer
-	 * @param bytes Kmer Array
-	 * @param c Input new gene character
+	 * @param bytes
+	 *            Kmer Array
+	 * @param c
+	 *            Input new gene character
 	 * @return the shiftout gene, in gene code format
 	 */
 	public static byte MoveKmer(int k, byte[] kmer, byte c) {
 		int byteNum = (byte) Math.ceil((double) k / 4.0);
-		byte output = (byte) (kmer[byteNum-1] & 0x03);
-		for(int i = byteNum-1; i >0; i--){
-			byte in = (byte) (kmer[i-1] & 0x03);
-			kmer[i] = (byte) ((kmer[i] >>> 2) | (in << 6));
+		byte output = (byte) (kmer[byteNum - 1] & 0x03);
+		for (int i = byteNum - 1; i > 0; i--) {
+			byte in = (byte) (kmer[i - 1] & 0x03);
+			kmer[i] = (byte) (((kmer[i] >>> 2) & 0x3f) | (in << 6));
 		}
-		
-		int pos = ((k-1) % 4) *2;
+
+		int pos = ((k - 1) % 4) * 2;
 		byte code = (byte) (GENE_CODE.getCodeFromSymbol(c) << pos);
 		kmer[0] = (byte) ((kmer[0] >>> 2) | code);
-		return output;
+		return (byte) (1 << output);
 	}
 
-	public static void main(String [] argv){
-		byte[] array = {'A','A','T','A','G','C','A','G'};
+	public static void main(String[] argv) {
+		byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
 		int k = 5;
 		byte[] kmer = CompressKmer(k, array, 0);
-		for (byte b : kmer){
-			System.out.print((int)b);
-			System.out.print(' ');
-		}
-		System.out.println();
-		System.out.println(recoverKmerFrom(k,kmer,0,kmer.length));
-		
-		byte out = MoveKmer(k, kmer, array[k]);
-		
-		System.out.println((int)out);
-		for (byte b : kmer){
+		for (byte b : kmer) {
 			System.out.print(Integer.toBinaryString(b));
 			System.out.print(' ');
 		}
 		System.out.println();
-		System.out.println(recoverKmerFrom(k,kmer,0,kmer.length));
+		System.out.println(recoverKmerFrom(k, kmer, 0, kmer.length));
+
+		for (int i = k; i < array.length-1; i++) {
+			byte out = MoveKmer(k, kmer, array[i]);
+
+			System.out.println((int) out);
+			for (byte b : kmer) {
+				System.out.print(Integer.toBinaryString(b));
+				System.out.print(' ');
+			}
+			System.out.println();
+			System.out.println(recoverKmerFrom(k, kmer, 0, kmer.length));
+		}
+
+		byte out = MoveKmer(k, kmer, array[array.length - 1]);
+
+		System.out.println((int) out);
+		for (byte b : kmer) {
+			System.out.print(Integer.toBinaryString(b));
+			System.out.print(' ');
+		}
+		System.out.println();
+		System.out.println(recoverKmerFrom(k, kmer, 0, kmer.length));
+
 	}
 }
diff --git a/genomix/genomix-core/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTestCase.java b/genomix/genomix-core/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTestCase.java
index 07c2e1d..c185d63 100644
--- a/genomix/genomix-core/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTestCase.java
+++ b/genomix/genomix-core/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTestCase.java
@@ -17,6 +17,7 @@
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.JobConf;
@@ -28,6 +29,7 @@
 import edu.uci.ics.genomix.driver.Driver;
 import edu.uci.ics.genomix.driver.Driver.Plan;
 import edu.uci.ics.genomix.job.GenomixJob;
+import edu.uci.ics.genomix.type.Kmer;
 import edu.uci.ics.genomix.type.KmerCountValue;
 import edu.uci.ics.hyracks.hdfs.utils.HyracksUtils;
 import edu.uci.ics.hyracks.hdfs.utils.TestUtils;
@@ -38,7 +40,8 @@
 
 	private static final String DATA_PATH = "src/test/resources/data/webmap/text.txt";
 	private static final String HDFS_INPUT_PATH = "/webmap";
-	private static final String HDFS_OUTPUT_PATH = "/webmap_result/";
+	private static final String HDFS_OUTPUT_PATH = "/webmap_result";
+	private static final String HDFS_OUTPUT_FILE = HDFS_OUTPUT_PATH + "/part-0";
 
 	private static final String DUMPED_RESULT = ACTUAL_RESULT_DIR
 			+ HDFS_OUTPUT_PATH + "/merged.txt";
@@ -125,19 +128,20 @@
 		Assert.assertEquals(true, checkResults());
 	}
 
-	// @Test
+//	@Test
 	public void TestPreClusterGroupby() throws Exception {
 		cleanUpReEntry();
 		conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+		conf.set(GenomixJob.OUTPUT_FORMAT, "text");
 		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
 		Assert.assertEquals(true, checkResults());
 	}
 
-	// @Test
+//	@Test
 	public void TestHybridGroupby() throws Exception {
 		cleanUpReEntry();
 		conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
-		conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
+		conf.set(GenomixJob.OUTPUT_FORMAT, "text");
 		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
 		Assert.assertEquals(true, checkResults());
 	}
@@ -146,23 +150,27 @@
 		FileUtil.copyMerge(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH),
 				FileSystem.getLocal(new Configuration()), new Path(
 						DUMPED_RESULT), false, conf, null);
-		
-        SequenceFile.Reader reader = null;
-        Path path = new Path(DUMPED_RESULT);
-        FileSystem dfs = FileSystem.get(conf);
-        reader = new SequenceFile.Reader(dfs, path, conf);
-        BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
-        KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
-        File filePathTo = new File(CONVERT_RESULT);
-        BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
-        while (reader.next(key, value)) {
-            bw.write(key + "\t" + value.toString());
-            bw.newLine();
-        }
-        bw.close();
+		File dumped = new File( DUMPED_RESULT);
+		String format = conf.get(GenomixJob.OUTPUT_FORMAT); 
+		if( !"text".equalsIgnoreCase(format)){
+	        SequenceFile.Reader reader = null;
+	        Path path = new Path(HDFS_OUTPUT_FILE);
+	        FileSystem dfs = FileSystem.get(conf);
+	        reader = new SequenceFile.Reader(dfs, path, conf);
+	        BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+	        KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+	        File filePathTo = new File(CONVERT_RESULT);
+	        BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+	        int k = conf.getInt(GenomixJob.KMER_LENGTH, 25);
+	        while (reader.next(key, value)) {
+	            bw.write(Kmer.recoverKmerFrom(k, key.getBytes(), 0, key.getLength()) + "\t" + value.toString());
+	            bw.newLine();
+	        }
+	        bw.close();
+	        dumped = new File(CONVERT_RESULT);
+		}
         
-		TestUtils.compareWithResult(new File(EXPECTED_PATH), new File(
-				DUMPED_RESULT));
+		TestUtils.compareWithResult(new File(EXPECTED_PATH), dumped);
 		return true;
 	}
 
diff --git a/genomix/genomix-core/src/test/resources/expected/result2 b/genomix/genomix-core/src/test/resources/expected/result2
index 2c44be3..a22dd28 100755
--- a/genomix/genomix-core/src/test/resources/expected/result2
+++ b/genomix/genomix-core/src/test/resources/expected/result2
@@ -1,4 +1,4 @@
-10 03	18	1
-31 00	1	1
-41 00	-128	1
-c4 00	17	1
\ No newline at end of file
+AATAG	|A	1
+ATAGA	A|A	1
+TAGAA	A|G	1
+AGAAG	T|	1