add reversed kmer option
git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@3301 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java
index 70b8bd6..e405424 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java
@@ -92,13 +92,13 @@
StringBuilder str = new StringBuilder();
for (int i = A; i <= T; i++) {
if ((left & (1 << i)) != 0) {
- str.append((char)GENE_SYMBOL[i]);
+ str.append((char) GENE_SYMBOL[i]);
}
}
str.append('|');
for (int i = A; i <= T; i++) {
if ((right & (1 << i)) != 0) {
- str.append((char)GENE_SYMBOL[i]);
+ str.append((char) GENE_SYMBOL[i]);
}
}
return str.toString();
@@ -118,17 +118,17 @@
}
return strKmer.toString();
}
-
- public static int getByteNumFromK(int k){
- int x = k/4;
- if (k%4 !=0){
- x+=1;
+
+ public static int getByteNumFromK(int k) {
+ int x = k / 4;
+ if (k % 4 != 0) {
+ x += 1;
}
return x;
}
/**
- * Compress Kmer into bytes array AATAG will compress as [0 0 0 G][A T A A]
+ * Compress a Kmer into a byte array; e.g. AATAG is compressed as [0x000G, 0xATAA]
*
* @param kmer
* @param input
@@ -137,7 +137,7 @@
* position
* @return initialed kmer array
*/
- public static byte[] CompressKmer(int k, byte[] array, int start) {
+ public static byte[] compressKmer(int k, byte[] array, int start) {
final int byteNum = getByteNumFromK(k);
byte[] bytes = new byte[byteNum];
@@ -170,7 +170,7 @@
* Input new gene character
* @return the shiftout gene, in gene code format
*/
- public static byte MoveKmer(int k, byte[] kmer, byte c) {
+ public static byte moveKmer(int k, byte[] kmer, byte c) {
int byteNum = kmer.length;
byte output = (byte) (kmer[byteNum - 1] & 0x03);
for (int i = byteNum - 1; i > 0; i--) {
@@ -178,12 +178,101 @@
kmer[i] = (byte) (((kmer[i] >>> 2) & 0x3f) | (in << 6));
}
- int pos = ((k - 1) % 4) * 2;
+ int pos = ((k - 1) % 4) << 1;
byte code = (byte) (GENE_CODE.getCodeFromSymbol(c) << pos);
kmer[0] = (byte) (((kmer[0] >>> 2) & 0x3f) | code);
return (byte) (1 << output);
}
+    /**
+     * Reverse the order of the four 2-bit gene codes packed in one byte.
+     *
+     * @param k
+     *            packed byte holding four 2-bit codes
+     * @return the byte with its four 2-bit codes in the opposite order
+     */
+    public static byte reverseKmerByte(byte k) {
+        // Step 1: swap the two 2-bit codes inside each nibble.
+        int pairsSwapped = ((k << 2) & 0xcc) | ((k >> 2) & 0x33);
+        // Step 2: swap the two nibbles.
+        int high = (pairsSwapped << 4) & 0xf0;
+        int low = (pairsSwapped >> 4) & 0x0f;
+        return (byte) (low | high);
+    }
+    /**
+     * Build a new array holding the k genes of a compressed kmer in reverse
+     * order. The input array is not modified.
+     *
+     * Genes are read starting at bit position ((k - 1) % 4) * 2 of kmer[0],
+     * moving toward lower bits and then on to the following bytes. They are
+     * written starting at the low bits of the last byte of the result, moving
+     * toward higher bits and then back to the preceding bytes.
+     *
+     * @param k
+     *            kmer length, in genes
+     * @param kmer
+     *            compressed kmer bytes
+     * @return a new array of the same length containing the reversed kmer
+     */
+    public static byte[] reverseKmer(int k, byte[] kmer) {
+        // A freshly allocated array is already zero-filled, so no byte needs
+        // to be cleared before genes are OR-ed into it.
+        byte[] reverseKmer = new byte[kmer.length];
+        int curPosAtKmer = ((k - 1) % 4) << 1;
+        int curByteAtKmer = 0;
+
+        int curPosAtReverse = 0;
+        int curByteAtReverse = reverseKmer.length - 1;
+        for (int i = 0; i < k; i++) {
+            byte gene = (byte) ((kmer[curByteAtKmer] >> curPosAtKmer) & 0x03);
+            reverseKmer[curByteAtReverse] |= gene << curPosAtReverse;
+            curPosAtReverse += 2;
+            if (curPosAtReverse >= 8) {
+                curPosAtReverse = 0;
+                // Only move the index; writing to the new slot here would
+                // touch index -1 after the final gene when k % 4 == 0.
+                curByteAtReverse--;
+            }
+            curPosAtKmer -= 2;
+            if (curPosAtKmer < 0) {
+                curPosAtKmer = 6;
+                curByteAtKmer++;
+            }
+        }
+
+        return reverseKmer;
+    }
+
+    /**
+     * Compress the reverse of a Kmer into a byte array. The k symbols of the
+     * window array[start .. start + k - 1] are packed two bits each, walking
+     * the window from its last symbol back to its first, so that AATAG is
+     * compressed as [0x000A, 0xATAG].
+     *
+     * @param k
+     *            kmer length, in symbols
+     * @param array
+     *            input symbol array
+     * @param start
+     *            index in array of the first symbol of the window
+     * @return the initialized kmer array holding the reversed window
+     */
+    public static byte[] compressKmerReverse(int k, byte[] array, int start) {
+        final int byteNum = getByteNumFromK(k);
+        byte[] bytes = new byte[byteNum];
+
+        byte l = 0;
+        int bytecount = 0;
+        int bcount = byteNum - 1;
+        // Stop at start, not 0: symbols before the window do not belong to
+        // this kmer, and packing them could run bcount below zero.
+        for (int i = start + k - 1; i >= start; i--) {
+            byte code = GENE_CODE.getCodeFromSymbol(array[i]);
+            l |= (byte) (code << bytecount);
+            bytecount += 2;
+            if (bytecount == 8) {
+                // Byte is full: store it and start filling the previous one.
+                bytes[bcount--] = l;
+                l = 0;
+                bytecount = 0;
+            }
+        }
+        // When k is not a multiple of 4 the leftover symbols are still in l
+        // and belong in the first byte.
+        if (bcount >= 0) {
+            bytes[0] = l;
+        }
+        return bytes;
+    }
+
+    /**
+     * Shift a reversed Kmer to accept one new gene. Every byte is shifted
+     * left by one 2-bit gene (carrying the top gene of the next byte in), the
+     * code of the new symbol is placed in the low two bits of the last byte,
+     * and the gene that was at bit position ((k - 1) % 4) * 2 of kmer[0] is
+     * shifted out. The array is modified in place.
+     *
+     * @param k
+     *            kmer length, in genes
+     * @param kmer
+     *            compressed reversed kmer, updated in place
+     * @param c
+     *            new gene character to insert
+     * @return a single set bit, 1 << code, for the gene that was shifted out
+     */
+    public static byte moveKmerReverse(int k, byte[] kmer, byte c) {
+        int pos = ((k - 1) % 4) << 1;
+        byte output = (byte) ((kmer[0] >> pos) & 0x03);
+        for (int i = 0; i < kmer.length - 1; i++) {
+            byte in = (byte) ((kmer[i + 1] >> 6) & 0x03);
+            kmer[i] = (byte) ((kmer[i] << 2) | in);
+        }
+        kmer[kmer.length - 1] = (byte) ((kmer[kmer.length - 1] << 2) | GENE_CODE
+                .getCodeFromSymbol(c));
+        // Clear the bits above the (k % 4) genes kept in the first byte. This
+        // runs after the last byte is shifted so that a one-byte kmer, where
+        // the first and last byte are the same, is also left clean.
+        if (k % 4 != 0) {
+            kmer[0] &= (1 << ((k % 4) << 1)) - 1;
+        }
+        return (byte) (1 << output);
+    }
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index ea951ba..1d41167 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -21,130 +21,131 @@
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
-public class KmerBytesWritable extends BinaryComparable implements WritableComparable<BinaryComparable> {
- private static final int LENGTH_BYTES = 4;
- private static final byte[] EMPTY_BYTES = {};
- private byte size;
- private byte[] bytes;
+public class KmerBytesWritable extends BinaryComparable implements
+ WritableComparable<BinaryComparable> {
+ private static final int LENGTH_BYTES = 4;
+ private static final byte[] EMPTY_BYTES = {};
+ private byte size;
+ private byte[] bytes;
- public KmerBytesWritable() {
- this(EMPTY_BYTES);
- }
+ public KmerBytesWritable() {
+ this(EMPTY_BYTES);
+ }
- public KmerBytesWritable(byte[] bytes) {
- this.bytes = bytes;
- this.size = (byte) bytes.length;
- }
+ public KmerBytesWritable(byte[] bytes) {
+ this.bytes = bytes;
+ this.size = (byte) bytes.length;
+ }
- @Override
- public byte[] getBytes() {
- return bytes;
- }
+ @Override
+ public byte[] getBytes() {
+ return bytes;
+ }
- @Deprecated
- public byte[] get() {
- return getBytes();
- }
+ @Deprecated
+ public byte[] get() {
+ return getBytes();
+ }
- @Override
- public int getLength() {
- return (int) size;
- }
+ @Override
+ public int getLength() {
+ return (int) size;
+ }
- @Deprecated
- public int getSize() {
- return getLength();
- }
+ @Deprecated
+ public int getSize() {
+ return getLength();
+ }
- public void setSize(byte size) {
- if ((int) size > getCapacity()) {
- setCapacity((byte) (size * 3 / 2));
- }
- this.size = size;
- }
+ public void setSize(byte size) {
+ if ((int) size > getCapacity()) {
+ setCapacity((byte) (size * 3 / 2));
+ }
+ this.size = size;
+ }
- public int getCapacity() {
- return bytes.length;
- }
+ public int getCapacity() {
+ return bytes.length;
+ }
- public void setCapacity(byte new_cap) {
- if (new_cap != getCapacity()) {
- byte[] new_data = new byte[new_cap];
- if (new_cap < size) {
- size = new_cap;
- }
- if (size != 0) {
- System.arraycopy(bytes, 0, new_data, 0, size);
- }
- bytes = new_data;
- }
- }
+ public void setCapacity(byte new_cap) {
+ if (new_cap != getCapacity()) {
+ byte[] new_data = new byte[new_cap];
+ if (new_cap < size) {
+ size = new_cap;
+ }
+ if (size != 0) {
+ System.arraycopy(bytes, 0, new_data, 0, size);
+ }
+ bytes = new_data;
+ }
+ }
- public void set(KmerBytesWritable newData) {
- set(newData.bytes, (byte) 0, newData.size);
- }
+ public void set(KmerBytesWritable newData) {
+ set(newData.bytes, (byte) 0, newData.size);
+ }
- public void set(byte[] newData, byte offset, byte length) {
- setSize((byte) 0);
- setSize(length);
- System.arraycopy(newData, offset, bytes, 0, size);
- }
+ public void set(byte[] newData, byte offset, byte length) {
+ setSize((byte) 0);
+ setSize(length);
+ System.arraycopy(newData, offset, bytes, 0, size);
+ }
- public void readFields(DataInput in) throws IOException {
- setSize((byte) 0); // clear the old data
- setSize(in.readByte());
- in.readFully(bytes, 0, size);
- }
+ public void readFields(DataInput in) throws IOException {
+ setSize((byte) 0); // clear the old data
+ setSize(in.readByte());
+ in.readFully(bytes, 0, size);
+ }
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeByte(size);
- out.write(bytes, 0, size);
- }
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(size);
+ out.write(bytes, 0, size);
+ }
- @Override
- public int hashCode() {
- return super.hashCode();
- }
+ @Override
+ public int hashCode() {
+ return super.hashCode();
+ }
- @Override
- public boolean equals(Object right_obj) {
- if (right_obj instanceof KmerBytesWritable)
- return super.equals(right_obj);
- return false;
- }
+ @Override
+ public boolean equals(Object right_obj) {
+ if (right_obj instanceof KmerBytesWritable)
+ return super.equals(right_obj);
+ return false;
+ }
- @Override
- public String toString() {
- StringBuffer sb = new StringBuffer(3 * size);
- for (int idx = 0; idx < (int) size; idx++) {
- // if not the first, put a blank separator in
- if (idx != 0) {
- sb.append(' ');
- }
- String num = Integer.toHexString(0xff & bytes[idx]);
- // if it is only one digit, add a leading 0.
- if (num.length() < 2) {
- sb.append('0');
- }
- sb.append(num);
- }
- return sb.toString();
- }
+ @Override
+ public String toString() {
+ StringBuffer sb = new StringBuffer(3 * size);
+ for (int idx = 0; idx < (int) size; idx++) {
+ // if not the first, put a blank separator in
+ if (idx != 0) {
+ sb.append(' ');
+ }
+ String num = Integer.toHexString(0xff & bytes[idx]);
+ // if it is only one digit, add a leading 0.
+ if (num.length() < 2) {
+ sb.append('0');
+ }
+ sb.append(num);
+ }
+ return sb.toString();
+ }
- public static class Comparator extends WritableComparator {
- public Comparator() {
- super(KmerBytesWritable.class);
- }
+ public static class Comparator extends WritableComparator {
+ public Comparator() {
+ super(KmerBytesWritable.class);
+ }
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- return compareBytes(b1, s1 + LENGTH_BYTES, l1 - LENGTH_BYTES, b2, s2 + LENGTH_BYTES, l2 - LENGTH_BYTES);
- }
- }
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ return compareBytes(b1, s1 + LENGTH_BYTES, l1 - LENGTH_BYTES, b2,
+ s2 + LENGTH_BYTES, l2 - LENGTH_BYTES);
+ }
+ }
- static { // register this comparator
- WritableComparator.define(KmerBytesWritable.class, new Comparator());
- }
+ static { // register this comparator
+ WritableComparator.define(KmerBytesWritable.class, new Comparator());
+ }
}
-
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
index 080110b..87eaa87 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
@@ -20,15 +20,14 @@
import org.apache.hadoop.io.Writable;
-
-public class KmerCountValue implements Writable{
+public class KmerCountValue implements Writable {
private byte adjBitMap;
private byte count;
public KmerCountValue(byte bitmap, byte count) {
- reset(bitmap, count);
+ set(bitmap, count);
}
-
+
public KmerCountValue() {
adjBitMap = 0;
count = 0;
@@ -48,20 +47,24 @@
@Override
public String toString() {
- return Kmer.GENE_CODE.getSymbolFromBitMap(adjBitMap) + '\t' + String.valueOf(count);
+ return Kmer.GENE_CODE.getSymbolFromBitMap(adjBitMap) + '\t'
+ + String.valueOf(count);
}
- public void reset(byte bitmap, byte count) {
+ public void set(byte bitmap, byte count) {
this.adjBitMap = bitmap;
this.count = count;
}
+
public byte getAdjBitMap() {
- return adjBitMap;
- }
+ return adjBitMap;
+ }
+
public void setAdjBitMap(byte adjBitMap) {
this.adjBitMap = adjBitMap;
}
+
public byte getCount() {
- return count;
+ return count;
}
}
\ No newline at end of file
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java
index f391d9f..8dacea1 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java
@@ -12,31 +12,70 @@
@Test
public void TestCompressKmer() {
- byte[] kmer = Kmer.CompressKmer(k, array, 0);
+ byte[] kmer = Kmer.compressKmer(k, array, 0);
String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
Assert.assertEquals(result, "AATAGAA");
- kmer = Kmer.CompressKmer(k, array, 1);
+ kmer = Kmer.compressKmer(k, array, 1);
result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
Assert.assertEquals(result, "ATAGAAG");
}
@Test
public void TestMoveKmer(){
- byte[] kmer = Kmer.CompressKmer(k, array, 0);
+ byte[] kmer = Kmer.compressKmer(k, array, 0);
String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
Assert.assertEquals(result, "AATAGAA");
for (int i = k; i < array.length-1; i++) {
- Kmer.MoveKmer(k, kmer, array[i]);
+ Kmer.moveKmer(k, kmer, array[i]);
Assert.assertTrue(false);
}
- byte out = Kmer.MoveKmer(k, kmer, array[array.length - 1]);
+ byte out = Kmer.moveKmer(k, kmer, array[array.length - 1]);
Assert.assertEquals(out, Kmer.GENE_CODE.getAdjBit((byte) 'A'));
result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
Assert.assertEquals(result, "ATAGAAG");
}
+
+    /** Reversing a compressed kmer must yield the same genes in opposite order. */
+    @Test
+    public void TestReverseKmer() {
+        byte[] kmer = Kmer.compressKmer(k, array, 0);
+        String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
+        // JUnit's assertEquals takes (expected, actual).
+        Assert.assertEquals("AATAGAA", result);
+        byte[] reversed = Kmer.reverseKmer(k, kmer);
+        // Decode using the reversed array's own length.
+        result = Kmer.recoverKmerFrom(k, reversed, 0, reversed.length);
+        Assert.assertEquals("AAGATAA", result);
+    }
+
+    /** Compressing in reverse from offsets 0 and 1 must decode to the reversed windows. */
+    @Test
+    public void TestCompressKmerReverse() {
+        byte[] kmer = Kmer.compressKmerReverse(k, array, 0);
+        String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
+        // JUnit's assertEquals takes (expected, actual).
+        Assert.assertEquals("AAGATAA", result);
+
+        kmer = Kmer.compressKmerReverse(k, array, 1);
+        result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
+        Assert.assertEquals("GAAGATA", result);
+    }
+
+    /** Shifting a reversed kmer by one gene must match compressing the next window in reverse. */
+    @Test
+    public void TestMoveKmerReverse() {
+        byte[] kmer = Kmer.compressKmerReverse(k, array, 0);
+        String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
+        // JUnit's assertEquals takes (expected, actual).
+        Assert.assertEquals("AAGATAA", result);
+
+        // NOTE(review): this loop fails the test if its body ever runs, so it
+        // presumably relies on array.length == k + 1 - confirm against the
+        // fixture before changing k or array.
+        for (int i = k; i < array.length - 1; i++) {
+            Kmer.moveKmerReverse(k, kmer, array[i]);
+            Assert.assertTrue(false);
+        }
+
+        byte out = Kmer.moveKmerReverse(k, kmer, array[array.length - 1]);
+        Assert.assertEquals(Kmer.GENE_CODE.getAdjBit((byte) 'A'), out);
+        result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
+        Assert.assertEquals("GAAGATA", result);
+    }
+
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
index 5e61c19..1a33c7a 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
@@ -49,7 +49,7 @@
bytCount = (byte) 127;
else
bytCount = (byte) count;
- vaWriter.reset(groupByAdjList, bytCount);
+ vaWriter.set(groupByAdjList, bytCount);
output.collect(key, vaWriter);
}
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
index 837866c..443cceb 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
@@ -78,27 +78,27 @@
/** first kmer */
byte count = 0;
byte[] array = geneLine.getBytes();
- byte[] kmer = Kmer.CompressKmer(KMER_SIZE, array, 0);
+ byte[] kmer = Kmer.compressKmer(KMER_SIZE, array, 0);
byte pre = 0;
byte next = GENE_CODE.getAdjBit(array[KMER_SIZE]);
byte adj = GENE_CODE.mergePreNextAdj(pre, next);
- outputAdjList.reset(adj, count);
+ outputAdjList.set(adj, count);
outputKmer.set(kmer, 0, kmer.length);
output.collect(outputKmer, outputAdjList);
/** middle kmer */
for (int i = KMER_SIZE; i < array.length - 1; i++) {
- pre = Kmer.MoveKmer(KMER_SIZE, kmer, array[i]);
+ pre = Kmer.moveKmer(KMER_SIZE, kmer, array[i]);
next = GENE_CODE.getAdjBit(array[i + 1]);
adj = GENE_CODE.mergePreNextAdj(pre, next);
- outputAdjList.reset(adj, count);
+ outputAdjList.set(adj, count);
outputKmer.set(kmer, 0, kmer.length);
output.collect(outputKmer, outputAdjList);
}
/** last kmer */
- pre = Kmer.MoveKmer(KMER_SIZE, kmer, array[array.length - 1]);
+ pre = Kmer.moveKmer(KMER_SIZE, kmer, array[array.length - 1]);
next = 0;
adj = GENE_CODE.mergePreNextAdj(pre, next);
- outputAdjList.reset(adj, count);
+ outputAdjList.set(adj, count);
outputKmer.set(kmer, 0, kmer.length);
output.collect(outputKmer, outputAdjList);
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
index 70981da..03bad56 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
@@ -50,7 +50,7 @@
bytCount = (byte) 127;
else
bytCount = (byte) count;
- valWriter.reset(groupByAdjList, bytCount);
+ valWriter.set(groupByAdjList, bytCount);
output.collect(key, valWriter);
reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
index 51e5221..8c3f277 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
@@ -52,7 +52,7 @@
byte bitmap = tuple.getFieldData(1)[tuple.getFieldStart(1)];
byte count = tuple.getFieldData(2)[tuple.getFieldStart(2)];
- reEnterCount.reset(bitmap, count);
+ reEnterCount.set(bitmap, count);
reEnterKey.set(kmer, keyStart, keyLength);
writer.append(reEnterKey, reEnterCount);
// @mark: this method can not used for read in hadoop 0.20.2.
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
index 5764d3f..e5b7fa9 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
@@ -25,10 +25,12 @@
private int k;
private int byteNum;
+ private boolean bReversed;
- public ReadsKeyValueParserFactory(int k) {
+ public ReadsKeyValueParserFactory(int k, boolean bGenerateReversed) {
this.k = k;
byteNum = (byte) Math.ceil((double) k / 4.0);
+ bReversed = bGenerateReversed;
}
@Override
@@ -56,22 +58,41 @@
private void SplitReads(byte[] array, IFrameWriter writer) {
/** first kmer */
- byte[] kmer = Kmer.CompressKmer(k, array, 0);
+ byte[] kmer = Kmer.compressKmer(k, array, 0);
byte pre = 0;
byte next = GENE_CODE.getAdjBit(array[k]);
InsertToFrame(kmer, pre, next, writer);
/** middle kmer */
for (int i = k; i < array.length - 1; i++) {
- pre = Kmer.MoveKmer(k, kmer, array[i]);
+ pre = Kmer.moveKmer(k, kmer, array[i]);
next = GENE_CODE.getAdjBit(array[i + 1]);
InsertToFrame(kmer, pre, next, writer);
}
/** last kmer */
- pre = Kmer.MoveKmer(k, kmer, array[array.length - 1]);
+ pre = Kmer.moveKmer(k, kmer, array[array.length - 1]);
next = 0;
InsertToFrame(kmer, pre, next, writer);
+
+ if (bReversed) {
+ /** first kmer */
+ kmer = Kmer.compressKmerReverse(k, array, 0);
+ next = 0;
+ pre = GENE_CODE.getAdjBit(array[k]);
+ InsertToFrame(kmer, pre, next, writer);
+ /** middle kmer */
+ for (int i = k; i < array.length - 1; i++) {
+ next = Kmer.moveKmerReverse(k, kmer, array[i]);
+ pre = GENE_CODE.getAdjBit(array[i + 1]);
+ InsertToFrame(kmer, pre, next, writer);
+ }
+ /** last kmer */
+ next = Kmer.moveKmerReverse(k, kmer,
+ array[array.length - 1]);
+ pre = 0;
+ InsertToFrame(kmer, pre, next, writer);
+ }
}
private void InsertToFrame(byte[] kmer, byte pre, byte next,
@@ -105,7 +126,7 @@
@Override
public void open(IFrameWriter writer) throws HyracksDataException {
// TODO Auto-generated method stub
-
+
}
@Override
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
index 39f181a..0751707 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
@@ -21,6 +21,8 @@
public static final String GROUPBY_TYPE = "genomix.graph.groupby.type";
/** Graph outputformat */
public static final String OUTPUT_FORMAT = "genomix.graph.output";
+ /** Get reversed Kmer Sequence */
+ public static final String REVERSED_KMER = "genomix.kmer.reversed";
/** Configurations used by hybrid groupby function in graph build phrase */
public static final String GROUPBY_HYBRID_INPUTSIZE = "genomix.graph.groupby.hybrid.inputsize";
@@ -28,8 +30,8 @@
public static final String GROUPBY_HYBRID_RECORDSIZE_SINGLE = "genomix.graph.groupby.hybrid.recordsize.single";
public static final String GROUPBY_HYBRID_RECORDSIZE_CROSS = "genomix.graph.groupby.hybrid.recordsize.cross";
public static final String GROUPBY_HYBRID_HASHLEVEL = "genomix.graph.groupby.hybrid.hashlevel";
-
- public static final int DEFAULT_KMER= 55;
+
+ public static final int DEFAULT_KMER = 55;
public static final int DEFAULT_FRAME_SIZE = 32768;
public static final int DEFAULT_FRAME_LIMIT = 4096;
public static final int DEFAULT_TABLE_SIZE = 10485767;
@@ -38,10 +40,12 @@
public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_SINGLE = 9;
public static final int DEFAULT_GROUPBY_HYBRID_HASHLEVEL = 1;
public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS = 13;
-
- public static final String DEFAULT_GROUPBY_TYPE ="hybrid";
- public static final String DEFAULT_OUTPUT_FORMAT ="binary";
-
+
+ public static final boolean DEFAULT_REVERSED = false;
+
+ public static final String DEFAULT_GROUPBY_TYPE = "hybrid";
+ public static final String DEFAULT_OUTPUT_FORMAT = "binary";
+
public GenomixJob() throws IOException {
super(new Configuration());
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
index b8e4219..683c0a1 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
@@ -66,6 +66,7 @@
private int tableSize;
private GroupbyType groupbyType;
private OutputFormat outputFormat;
+ private boolean bGenerateReversedKmer;
private AbstractOperatorDescriptor singleGrouper;
private IConnectorDescriptor connPartition;
@@ -207,7 +208,8 @@
}
LOG.info("HDFS read schedule " + log);
return new HDFSReadOperatorDescriptor(jobSpec, readOutputRec, job,
- splits, readSchedule, new ReadsKeyValueParserFactory(kmers));
+ splits, readSchedule, new ReadsKeyValueParserFactory(kmers,
+ bGenerateReversedKmer));
} catch (Exception e) {
throw new HyracksDataException(e);
}
@@ -300,6 +302,9 @@
GenomixJob.GROUPBY_HYBRID_RECORDSIZE_CROSS,
GenomixJob.DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS);
+ bGenerateReversedKmer = conf.getBoolean(GenomixJob.REVERSED_KMER,
+ GenomixJob.DEFAULT_REVERSED);
+
String type = conf.get(GenomixJob.GROUPBY_TYPE,
GenomixJob.DEFAULT_GROUPBY_TYPE);
if (type.equalsIgnoreCase("external")) {
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
index 3e80ab7..f9bf5b0 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
@@ -45,6 +45,7 @@
+ HDFS_OUTPUT_PATH + "/merged.txt";
private static final String CONVERT_RESULT = DUMPED_RESULT + ".txt";
private static final String EXPECTED_PATH = "src/test/resources/expected/result2";
+ private static final String EXPECTED_REVERSE_PATH = "src/test/resources/expected/result_reverse";
private static final String HYRACKS_APP_NAME = "genomix";
private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR
@@ -125,33 +126,59 @@
TestPreClusterGroupby();
cleanUpReEntry();
TestHybridGroupby();
+ cleanUpReEntry();
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ TestExternalReversedGroupby();
+ cleanUpReEntry();
+ TestPreClusterReversedGroupby();
+ cleanUpReEntry();
+ TestHybridReversedGroupby();
}
public void TestExternalGroupby() throws Exception {
conf.set(GenomixJob.GROUPBY_TYPE, "external");
- conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
System.err.println("Testing ExternalGroupBy");
driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults());
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
}
public void TestPreClusterGroupby() throws Exception {
conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
- conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
System.err.println("Testing PreClusterGroupBy");
driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults());
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
}
public void TestHybridGroupby() throws Exception {
conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
- conf.set(GenomixJob.OUTPUT_FORMAT, "binary");
System.err.println("Testing HybridGroupBy");
driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults());
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+ }
+
+ public void TestExternalReversedGroupby() throws Exception{
+ conf.set(GenomixJob.GROUPBY_TYPE, "external");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing ExternalGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
+ public void TestPreClusterReversedGroupby() throws Exception{
+ conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing PreclusterGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
+ public void TestHybridReversedGroupby() throws Exception{
+ conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing HybridGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
}
- private boolean checkResults() throws Exception {
+ private boolean checkResults(String expectedPath) throws Exception {
File dumped = null;
String format = conf.get(GenomixJob.OUTPUT_FORMAT);
if ("text".equalsIgnoreCase(format)) {
@@ -206,7 +233,7 @@
dumped = new File(CONVERT_RESULT);
}
- TestUtils.compareWithSortedResult(new File(EXPECTED_PATH), dumped);
+ TestUtils.compareWithSortedResult(new File(expectedPath), dumped);
return true;
}
diff --git a/genomix/genomix-hyracks/src/test/resources/expected/result_reverse b/genomix/genomix-hyracks/src/test/resources/expected/result_reverse
new file mode 100644
index 0000000..cf2712d
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/expected/result_reverse
@@ -0,0 +1,8 @@
+AAGAT G|A 5
+AATAG |A 5
+AGAAG T| 5
+AGATA A|A 5
+ATAGA A|A 5
+GAAGA |T 5
+GATAA A| 5
+TAGAA A|G 5