Text output test passes
git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@2961 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/FileScanDescriptor.java b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/FileScanDescriptor.java
index 3a53ef2..d8f0122 100644
--- a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/FileScanDescriptor.java
+++ b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/FileScanDescriptor.java
@@ -150,7 +150,7 @@
try {
byte adj = GENE_CODE.mergePreNextAdj(pre, next);
tupleBuilder.reset();
- tupleBuilder.addField(kmer, 0, byteNum + 1);
+ tupleBuilder.addField(kmer, 0, byteNum);
tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE,
adj);
diff --git a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
index 441b798..332b7d7 100644
--- a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
+++ b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
@@ -26,16 +26,16 @@
public void write(DataOutput output, ITupleReference tuple)
throws HyracksDataException {
try {
- output.writeChars(Kmer.recoverKmerFrom(KMER,
+ output.write(Kmer.recoverKmerFrom(KMER,
tuple.getFieldData(0), tuple.getFieldStart(0),
- tuple.getFieldLength(0)));
- output.writeChar('\t');
- output.writeChars(Kmer.recoverAdjacent(tuple
- .getFieldData(1)[tuple.getFieldStart(1)]));
- output.writeChar('\t');
- output.writeInt((int)tuple
- .getFieldData(2)[tuple.getFieldStart(2)]);
- output.writeChar('\n');
+ tuple.getFieldLength(0)).getBytes());
+ output.writeByte('\t');
+ output.write(Kmer.GENE_CODE.getSymbolFromBitMap(tuple
+ .getFieldData(1)[tuple.getFieldStart(1)]).getBytes());
+ output.writeByte('\t');
+ output.write(String.valueOf((int)tuple
+ .getFieldData(2)[tuple.getFieldStart(2)]).getBytes());
+ output.writeByte('\n');
} catch (IOException e) {
throw new HyracksDataException(e);
}
diff --git a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
index 1b13760..bc65204 100644
--- a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
@@ -86,7 +86,7 @@
try {
byte adj = GENE_CODE.mergePreNextAdj(pre, next);
tupleBuilder.reset();
- tupleBuilder.addField(kmer, 0, byteNum + 1);
+ tupleBuilder.addField(kmer, 0, byteNum);
tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE,
adj);
diff --git a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/type/Kmer.java b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/type/Kmer.java
index 19d754d..8517432 100644
--- a/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/type/Kmer.java
+++ b/genomix/genomix-core/src/main/java/edu/uci/ics/genomix/type/Kmer.java
@@ -1,6 +1,5 @@
package edu.uci.ics.genomix.type;
-
public class Kmer {
public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
@@ -8,7 +7,8 @@
public final static class GENE_CODE {
/**
- * make sure this 4 ids equal to the sequence id of char in {@GENE_SYMBOL}
+ * make sure these 4 ids equal the index of the matching char in
+ * {@link Kmer#GENE_SYMBOL}
*/
public static final byte A = 0;
public static final byte C = 1;
@@ -67,24 +67,24 @@
}
return r;
}
-
- public static byte mergePreNextAdj(byte pre, byte next){
+
+ public static byte mergePreNextAdj(byte pre, byte next) {
return (byte) (pre << 4 | next & 0x0f);
}
-
+
public static String getSymbolFromBitMap(byte code) {
int left = (code >> 4) & 0x0F;
int right = code & 0x0F;
String str = new String();
- for(int i = A; i <= T ; i++){
- if ( (left & (1<<i)) != 0){
- str += GENE_SYMBOL[i];
+ for (int i = A; i <= T; i++) {
+ if ((left & (1 << i)) != 0) {
+ str += (char)GENE_SYMBOL[i];
}
}
str += '|';
- for(int i = A; i <= T ; i++){
- if ( (right & (1<<i)) != 0){
- str += GENE_SYMBOL[i];
+ for (int i = A; i <= T; i++) {
+ if ((right & (1 << i)) != 0) {
+ str += (char)GENE_SYMBOL[i];
}
}
return str;
@@ -94,94 +94,106 @@
public static String recoverKmerFrom(int k, byte[] keyData, int keyStart,
int keyLength) {
String strKmer = new String();
- int byteId = keyStart + keyLength-1;
+ int byteId = keyStart + keyLength - 1;
byte currentbyte = keyData[byteId];
- for(int geneCount = 0; geneCount < k ; geneCount++){
- if (geneCount % 4 == 0 && geneCount > 0){
+ for (int geneCount = 0; geneCount < k; geneCount++) {
+ if (geneCount % 4 == 0 && geneCount > 0) {
currentbyte = keyData[--byteId];
}
- strKmer += (char)GENE_SYMBOL[(currentbyte >> ((geneCount%4)*2)) & 0x03];
+ strKmer += (char) GENE_SYMBOL[(currentbyte >> ((geneCount % 4) * 2)) & 0x03];
}
return strKmer;
}
- public static String recoverAdjacent(byte number) {
- int incoming = (number & 0xF0) >> 4;
- int outgoing = number & 0x0F;
- return String.valueOf(incoming) + '|' + String.valueOf(outgoing);
- }
-
/**
- * Compress Kmer into bytes array
- * AATAG will compress as [0 0 0 G][A T A A]
- * @param kmer
- * @param input array
- * @param start position
+ * Compress Kmer into a byte array. AATAG will be compressed as [0 0 0 G][A T A A]
+ *
+ * @param k number of gene symbols in the kmer
+ * @param array
+ * input array
+ * @param start
+ * start position in the input array
* @return initialed kmer array
*/
public static byte[] CompressKmer(int k, byte[] array, int start) {
final int byteNum = (byte) Math.ceil((double) k / 4.0);
- byte[] bytes = new byte[byteNum ];
+ byte[] bytes = new byte[byteNum];
byte l = 0;
int bytecount = 0;
- int bcount = byteNum-1;
- for (int i = start; i<start +k; i++){
+ int bcount = byteNum - 1;
+ for (int i = start; i < start + k; i++) {
byte code = GENE_CODE.getCodeFromSymbol(array[i]);
l |= (byte) (code << bytecount);
- bytecount +=2;
- if (bytecount == 8){
+ bytecount += 2;
+ if (bytecount == 8) {
bytes[bcount--] = l;
l = 0;
- bytecount= 0;
+ bytecount = 0;
}
}
- if (bcount >= 0){
- bytes[0]=l;
+ if (bcount >= 0) {
+ bytes[0] = l;
}
return bytes;
}
/**
* Shift Kmer to accept new input
+ *
* @param kmer
- * @param bytes Kmer Array
- * @param c Input new gene character
+ * @param bytes
+ * Kmer Array
+ * @param c
+ * Input new gene character
* @return the shiftout gene, in gene code format
*/
public static byte MoveKmer(int k, byte[] kmer, byte c) {
int byteNum = (byte) Math.ceil((double) k / 4.0);
- byte output = (byte) (kmer[byteNum-1] & 0x03);
- for(int i = byteNum-1; i >0; i--){
- byte in = (byte) (kmer[i-1] & 0x03);
- kmer[i] = (byte) ((kmer[i] >>> 2) | (in << 6));
+ byte output = (byte) (kmer[byteNum - 1] & 0x03);
+ for (int i = byteNum - 1; i > 0; i--) {
+ byte in = (byte) (kmer[i - 1] & 0x03);
+ kmer[i] = (byte) (((kmer[i] >>> 2) & 0x3f) | (in << 6));
}
-
- int pos = ((k-1) % 4) *2;
+
+ int pos = ((k - 1) % 4) * 2;
byte code = (byte) (GENE_CODE.getCodeFromSymbol(c) << pos);
kmer[0] = (byte) ((kmer[0] >>> 2) | code);
- return output;
+ return (byte) (1 << output);
}
- public static void main(String [] argv){
- byte[] array = {'A','A','T','A','G','C','A','G'};
+ public static void main(String[] argv) {
+ byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
int k = 5;
byte[] kmer = CompressKmer(k, array, 0);
- for (byte b : kmer){
- System.out.print((int)b);
- System.out.print(' ');
- }
- System.out.println();
- System.out.println(recoverKmerFrom(k,kmer,0,kmer.length));
-
- byte out = MoveKmer(k, kmer, array[k]);
-
- System.out.println((int)out);
- for (byte b : kmer){
+ for (byte b : kmer) {
System.out.print(Integer.toBinaryString(b));
System.out.print(' ');
}
System.out.println();
- System.out.println(recoverKmerFrom(k,kmer,0,kmer.length));
+ System.out.println(recoverKmerFrom(k, kmer, 0, kmer.length));
+
+ for (int i = k; i < array.length-1; i++) {
+ byte out = MoveKmer(k, kmer, array[i]);
+
+ System.out.println((int) out);
+ for (byte b : kmer) {
+ System.out.print(Integer.toBinaryString(b));
+ System.out.print(' ');
+ }
+ System.out.println();
+ System.out.println(recoverKmerFrom(k, kmer, 0, kmer.length));
+ }
+
+ byte out = MoveKmer(k, kmer, array[array.length - 1]);
+
+ System.out.println((int) out);
+ for (byte b : kmer) {
+ System.out.print(Integer.toBinaryString(b));
+ System.out.print(' ');
+ }
+ System.out.println();
+ System.out.println(recoverKmerFrom(k, kmer, 0, kmer.length));
+
}
}
diff --git a/genomix/genomix-core/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTestCase.java b/genomix/genomix-core/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTestCase.java
index 07c2e1d..c185d63 100644
--- a/genomix/genomix-core/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTestCase.java
+++ b/genomix/genomix-core/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTestCase.java
@@ -17,6 +17,7 @@
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
@@ -28,6 +29,7 @@
import edu.uci.ics.genomix.driver.Driver;
import edu.uci.ics.genomix.driver.Driver.Plan;
import edu.uci.ics.genomix.job.GenomixJob;
+import edu.uci.ics.genomix.type.Kmer;
import edu.uci.ics.genomix.type.KmerCountValue;
import edu.uci.ics.hyracks.hdfs.utils.HyracksUtils;
import edu.uci.ics.hyracks.hdfs.utils.TestUtils;
@@ -38,7 +40,8 @@
private static final String DATA_PATH = "src/test/resources/data/webmap/text.txt";
private static final String HDFS_INPUT_PATH = "/webmap";
- private static final String HDFS_OUTPUT_PATH = "/webmap_result/";
+ private static final String HDFS_OUTPUT_PATH = "/webmap_result";
+ private static final String HDFS_OUTPUT_FILE = HDFS_OUTPUT_PATH + "/part-0";
private static final String DUMPED_RESULT = ACTUAL_RESULT_DIR
+ HDFS_OUTPUT_PATH + "/merged.txt";
@@ -125,19 +128,20 @@
Assert.assertEquals(true, checkResults());
}
- // @Test
+// @Test
public void TestPreClusterGroupby() throws Exception {
cleanUpReEntry();
conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+ conf.set(GenomixJob.OUTPUT_FORMAT, "text");
driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
Assert.assertEquals(true, checkResults());
}
- // @Test
+// @Test
public void TestHybridGroupby() throws Exception {
cleanUpReEntry();
conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
- conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
+ conf.set(GenomixJob.OUTPUT_FORMAT, "text");
driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
Assert.assertEquals(true, checkResults());
}
@@ -146,23 +150,27 @@
FileUtil.copyMerge(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH),
FileSystem.getLocal(new Configuration()), new Path(
DUMPED_RESULT), false, conf, null);
-
- SequenceFile.Reader reader = null;
- Path path = new Path(DUMPED_RESULT);
- FileSystem dfs = FileSystem.get(conf);
- reader = new SequenceFile.Reader(dfs, path, conf);
- BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
- KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
- File filePathTo = new File(CONVERT_RESULT);
- BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
- while (reader.next(key, value)) {
- bw.write(key + "\t" + value.toString());
- bw.newLine();
- }
- bw.close();
+ File dumped = new File( DUMPED_RESULT);
+ String format = conf.get(GenomixJob.OUTPUT_FORMAT);
+ if( !"text".equalsIgnoreCase(format)){
+ SequenceFile.Reader reader = null;
+ Path path = new Path(HDFS_OUTPUT_FILE);
+ FileSystem dfs = FileSystem.get(conf);
+ reader = new SequenceFile.Reader(dfs, path, conf);
+ BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+ File filePathTo = new File(CONVERT_RESULT);
+ BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+ int k = conf.getInt(GenomixJob.KMER_LENGTH, 25);
+ while (reader.next(key, value)) {
+ bw.write(Kmer.recoverKmerFrom(k, key.getBytes(), 0, key.getLength()) + "\t" + value.toString());
+ bw.newLine();
+ }
+ bw.close();
+ dumped = new File(CONVERT_RESULT);
+ }
- TestUtils.compareWithResult(new File(EXPECTED_PATH), new File(
- DUMPED_RESULT));
+ TestUtils.compareWithResult(new File(EXPECTED_PATH), dumped);
return true;
}
diff --git a/genomix/genomix-core/src/test/resources/expected/result2 b/genomix/genomix-core/src/test/resources/expected/result2
index 2c44be3..a22dd28 100755
--- a/genomix/genomix-core/src/test/resources/expected/result2
+++ b/genomix/genomix-core/src/test/resources/expected/result2
@@ -1,4 +1,4 @@
-10 03 18 1
-31 00 1 1
-41 00 -128 1
-c4 00 17 1
\ No newline at end of file
+AATAG |A 1
+ATAGA A|A 1
+TAGAA A|G 1
+AGAAG T| 1