make genomix-hadoop genomix-hyracks compilable
diff --git a/genomix/genomix-data/src/main/assembly/binary-assembly.xml b/genomix/genomix-data/src/main/assembly/binary-assembly.xml
index 0500499..68d424a 100644
--- a/genomix/genomix-data/src/main/assembly/binary-assembly.xml
+++ b/genomix/genomix-data/src/main/assembly/binary-assembly.xml
@@ -1,19 +1,19 @@
<assembly>
- <id>binary-assembly</id>
- <formats>
- <format>zip</format>
- <format>dir</format>
- </formats>
- <includeBaseDirectory>false</includeBaseDirectory>
- <fileSets>
- <fileSet>
- <directory>target/appassembler/bin</directory>
- <outputDirectory>bin</outputDirectory>
- <fileMode>0755</fileMode>
- </fileSet>
- <fileSet>
- <directory>target/appassembler/lib</directory>
- <outputDirectory>lib</outputDirectory>
- </fileSet>
- </fileSets>
+ <id>binary-assembly</id>
+ <formats>
+ <format>zip</format>
+ <format>dir</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <fileSets>
+ <fileSet>
+ <directory>target/appassembler/bin</directory>
+ <outputDirectory>bin</outputDirectory>
+ <fileMode>0755</fileMode>
+ </fileSet>
+ <fileSet>
+ <directory>target/appassembler/lib</directory>
+ <outputDirectory>lib</outputDirectory>
+ </fileSet>
+ </fileSets>
</assembly>
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java
index ef5a62b..1ed6f80 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java
@@ -1,126 +1,126 @@
package edu.uci.ics.genomix.type;
public class GeneCode {
- public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
- /**
- * make sure this 4 ids equal to the sequence id of char in {@GENE_SYMBOL
+ public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
+ /**
+ * make sure these 4 ids are equal to the index of the matching char in {@link #GENE_SYMBOL
* }
- */
- public static final byte A = 0;
- public static final byte C = 1;
- public static final byte G = 2;
- public static final byte T = 3;
+ */
+ public static final byte A = 0;
+ public static final byte C = 1;
+ public static final byte G = 2;
+ public static final byte T = 3;
- public static byte getCodeFromSymbol(byte ch) {
- byte r = 0;
- switch (ch) {
- case 'A':
- case 'a':
- r = A;
- break;
- case 'C':
- case 'c':
- r = C;
- break;
- case 'G':
- case 'g':
- r = G;
- break;
- case 'T':
- case 't':
- r = T;
- break;
- }
- return r;
- }
+ public static byte getCodeFromSymbol(byte ch) {
+ byte r = 0;
+ switch (ch) {
+ case 'A':
+ case 'a':
+ r = A;
+ break;
+ case 'C':
+ case 'c':
+ r = C;
+ break;
+ case 'G':
+ case 'g':
+ r = G;
+ break;
+ case 'T':
+ case 't':
+ r = T;
+ break;
+ }
+ return r;
+ }
- public static byte getSymbolFromCode(byte code) {
- if (code > 3) {
- return '!';
- }
- return GENE_SYMBOL[code];
- }
+ public static byte getSymbolFromCode(byte code) {
+ if (code < 0 || code > 3) { // also reject negative codes, e.g. the -1 returned by getGeneCodeFromBitMap
+ return '!';
+ }
+ return GENE_SYMBOL[code];
+ }
- public static byte getAdjBit(byte t) {
- byte r = 0;
- switch (t) {
- case 'A':
- case 'a':
- r = 1 << A;
- break;
- case 'C':
- case 'c':
- r = 1 << C;
- break;
- case 'G':
- case 'g':
- r = 1 << G;
- break;
- case 'T':
- case 't':
- r = 1 << T;
- break;
- }
- return r;
- }
+ public static byte getAdjBit(byte t) {
+ byte r = 0;
+ switch (t) {
+ case 'A':
+ case 'a':
+ r = 1 << A;
+ break;
+ case 'C':
+ case 'c':
+ r = 1 << C;
+ break;
+ case 'G':
+ case 'g':
+ r = 1 << G;
+ break;
+ case 'T':
+ case 't':
+ r = 1 << T;
+ break;
+ }
+ return r;
+ }
- /**
- * It works for path merge. Merge the kmer by his next, we need to make sure
- * the @{t} is a single neighbor.
- *
- * @param t
- * the neighbor code in BitMap
- * @return the genecode
- */
- public static byte getGeneCodeFromBitMap(byte t) {
- switch (t) {
- case 1 << A:
- return A;
- case 1 << C:
- return C;
- case 1 << G:
- return G;
- case 1 << T:
- return T;
- }
- return -1;
- }
-
- public static int countNumberOfBitSet(int i) {
- int c = 0;
- for (; i != 0; c++) {
- i &= i - 1;
- }
- return c;
- }
-
- public static int inDegree(byte bitmap) {
- return countNumberOfBitSet((bitmap >> 4) & 0x0f);
- }
+ /**
+ * It works for path merge. Merge the kmer by his next, we need to make sure
+ * the @{t} is a single neighbor.
+ *
+ * @param t
+ * the neighbor code in BitMap
+ * @return the genecode
+ */
+ public static byte getGeneCodeFromBitMap(byte t) {
+ switch (t) {
+ case 1 << A:
+ return A;
+ case 1 << C:
+ return C;
+ case 1 << G:
+ return G;
+ case 1 << T:
+ return T;
+ }
+ return -1;
+ }
- public static int outDegree(byte bitmap) {
- return countNumberOfBitSet(bitmap & 0x0f);
- }
+ public static int countNumberOfBitSet(int i) {
+ int c = 0;
+ for (; i != 0; c++) {
+ i &= i - 1;
+ }
+ return c;
+ }
- public static byte mergePreNextAdj(byte pre, byte next) {
- return (byte) (pre << 4 | (next & 0x0f));
- }
+ public static int inDegree(byte bitmap) {
+ return countNumberOfBitSet((bitmap >> 4) & 0x0f);
+ }
- public static String getSymbolFromBitMap(byte code) {
- int left = (code >> 4) & 0x0F;
- int right = code & 0x0F;
- StringBuilder str = new StringBuilder();
- for (int i = A; i <= T; i++) {
- if ((left & (1 << i)) != 0) {
- str.append((char) GENE_SYMBOL[i]);
- }
- }
- str.append('|');
- for (int i = A; i <= T; i++) {
- if ((right & (1 << i)) != 0) {
- str.append((char) GENE_SYMBOL[i]);
- }
- }
- return str.toString();
- }
+ public static int outDegree(byte bitmap) {
+ return countNumberOfBitSet(bitmap & 0x0f);
+ }
+
+ public static byte mergePreNextAdj(byte pre, byte next) {
+ return (byte) (pre << 4 | (next & 0x0f));
+ }
+
+ public static String getSymbolFromBitMap(byte code) {
+ int left = (code >> 4) & 0x0F;
+ int right = code & 0x0F;
+ StringBuilder str = new StringBuilder();
+ for (int i = A; i <= T; i++) {
+ if ((left & (1 << i)) != 0) {
+ str.append((char) GENE_SYMBOL[i]);
+ }
+ }
+ str.append('|');
+ for (int i = A; i <= T; i++) {
+ if ((right & (1 << i)) != 0) {
+ str.append((char) GENE_SYMBOL[i]);
+ }
+ }
+ return str.toString();
+ }
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index e8d3e67..ded9f36 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -26,225 +26,228 @@
* It was used to generate the graph in which phase the kmer length doesn't change.
* Thus the size of bytes doesn't change either.
*/
-public class KmerBytesWritable extends BinaryComparable implements
- WritableComparable<BinaryComparable> {
- protected int size;
- protected byte[] bytes;
- protected int kmerlength;
+public class KmerBytesWritable extends BinaryComparable implements WritableComparable<BinaryComparable> {
+ protected int size;
+ protected byte[] bytes;
+ protected int kmerlength;
- /**
- * Initial Kmer space by kmerlength
- *
- * @param k
- * kmerlength
- */
- public KmerBytesWritable(int k) {
- this.kmerlength = k;
- this.size = KmerUtil.getByteNumFromK(kmerlength);
- this.bytes = new byte[this.size];
- }
+ /**
+ * Initial Kmer space by kmerlength
+ *
+ * @param k
+ * kmerlength
+ */
+ public KmerBytesWritable(int k) {
+ this.kmerlength = k;
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ this.bytes = new byte[this.size];
+ }
- public KmerBytesWritable(KmerBytesWritable right) {
- this.kmerlength = right.kmerlength;
- this.size = right.size;
- this.bytes = new byte[right.size];
- set(right);
- }
+ public KmerBytesWritable(KmerBytesWritable right) {
+ this.kmerlength = right.kmerlength;
+ this.size = right.size;
+ this.bytes = new byte[right.size];
+ set(right);
+ }
- public byte getGeneCodeAtPosition(int pos) {
- if (pos >= kmerlength) {
- return -1;
- }
- int posByte = pos / 4;
- int shift = (pos % 4) << 1;
- return (byte) ((bytes[size - 1 - posByte] >> shift) & 0x3);
- }
+ public byte getGeneCodeAtPosition(int pos) {
+ if (pos >= kmerlength) {
+ return -1;
+ }
+ int posByte = pos / 4;
+ int shift = (pos % 4) << 1;
+ return (byte) ((bytes[size - 1 - posByte] >> shift) & 0x3);
+ }
- public int getKmerLength() {
- return this.kmerlength;
- }
+ public int getKmerLength() {
+ return this.kmerlength;
+ }
- @Override
- public byte[] getBytes() {
- return bytes;
- }
+ @Override
+ public byte[] getBytes() {
+ return bytes;
+ }
- @Override
- public int getLength() {
- return size;
- }
+ @Override
+ public int getLength() {
+ return size;
+ }
- /**
- * Read Kmer from read text into bytes array e.g. AATAG will compress as
- * [0x000G, 0xATAA]
- *
- * @param k
- * @param array
- * @param start
- */
- public void setByRead(byte[] array, int start) {
- byte l = 0;
- int bytecount = 0;
- int bcount = this.size - 1;
- for (int i = start; i < start + kmerlength; i++) {
- byte code = GeneCode.getCodeFromSymbol(array[i]);
- l |= (byte) (code << bytecount);
- bytecount += 2;
- if (bytecount == 8) {
- bytes[bcount--] = l;
- l = 0;
- bytecount = 0;
- }
- }
- if (bcount >= 0) {
- bytes[0] = l;
- }
- }
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public void setByRead(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = this.size - 1;
+ for (int i = start; i < start + kmerlength; i++) {
+ byte code = GeneCode.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[0] = l;
+ }
+ }
- /**
- * Compress Reversed Kmer into bytes array AATAG will compress as
- * [0x000A,0xATAG]
- *
- * @param input
- * array
- * @param start
- * position
- */
- public void setByReadReverse(byte[] array, int start) {
- byte l = 0;
- int bytecount = 0;
- int bcount = size - 1;
- for (int i = start + kmerlength - 1; i >= 0; i--) {
- byte code = GeneCode.getCodeFromSymbol(array[i]);
- l |= (byte) (code << bytecount);
- bytecount += 2;
- if (bytecount == 8) {
- bytes[bcount--] = l;
- l = 0;
- bytecount = 0;
- }
- }
- if (bcount >= 0) {
- bytes[0] = l;
- }
- }
+ /**
+ * Compress Reversed Kmer into bytes array AATAG will compress as
+ * [0x000A,0xATAG]. Only the kmerlength symbols beginning at start are read.
+ *
+ * @param array
+ * input read text, one gene symbol per byte
+ * @param start
+ * position in array of the first symbol of the kmer
+ */
+ public void setByReadReverse(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = size - 1;
+ for (int i = start + kmerlength - 1; i >= start; i--) { // stop at start, not 0, so symbols before the kmer window are never packed
+ byte code = GeneCode.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[0] = l;
+ }
+ }
- /**
- * Shift Kmer to accept new char input
- *
- * @param c
- * Input new gene character
- * @return the shift out gene, in gene code format
- */
- public byte shiftKmerWithNextChar(byte c) {
- return shiftKmerWithNextCode(GeneCode.getCodeFromSymbol(c));
- }
+ /**
+ * Shift Kmer to accept new char input
+ *
+ * @param c
+ * Input new gene character
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextChar(byte c) {
+ return shiftKmerWithNextCode(GeneCode.getCodeFromSymbol(c));
+ }
- /**
- * Shift Kmer to accept new gene code
- *
- * @param c
- * Input new gene code
- * @return the shift out gene, in gene code format
- */
- public byte shiftKmerWithNextCode(byte c) {
- byte output = (byte) (bytes[size - 1] & 0x03);
- for (int i = size - 1; i > 0; i--) {
- byte in = (byte) (bytes[i - 1] & 0x03);
- bytes[i] = (byte) (((bytes[i] >>> 2) & 0x3f) | (in << 6));
- }
- int pos = ((kmerlength - 1) % 4) << 1;
- byte code = (byte) (c << pos);
- bytes[0] = (byte) (((bytes[0] >>> 2) & 0x3f) | code);
- return (byte) (1 << output);
- }
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextCode(byte c) {
+ byte output = (byte) (bytes[size - 1] & 0x03);
+ for (int i = size - 1; i > 0; i--) {
+ byte in = (byte) (bytes[i - 1] & 0x03);
+ bytes[i] = (byte) (((bytes[i] >>> 2) & 0x3f) | (in << 6));
+ }
+ int pos = ((kmerlength - 1) % 4) << 1;
+ byte code = (byte) (c << pos);
+ bytes[0] = (byte) (((bytes[0] >>> 2) & 0x3f) | code);
+ return (byte) (1 << output);
+ }
- /**
- * Shift Kmer to accept new input char
- *
- * @param c
- * Input new gene character
- * @return the shiftout gene, in gene code format
- */
- public byte shiftKmerWithPreChar(byte c) {
- return shiftKmerWithPreCode(GeneCode.getCodeFromSymbol(c));
- }
+ /**
+ * Shift Kmer to accept new input char
+ *
+ * @param c
+ * Input new gene character
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreChar(byte c) {
+ return shiftKmerWithPreCode(GeneCode.getCodeFromSymbol(c));
+ }
- /**
- * Shift Kmer to accept new gene code
- *
- * @param c
- * Input new gene code
- * @return the shiftout gene, in gene code format
- */
- public byte shiftKmerWithPreCode(byte c) {
- int pos = ((kmerlength - 1) % 4) << 1;
- byte output = (byte) ((bytes[0] >> pos) & 0x03);
- for (int i = 0; i < size - 1; i++) {
- byte in = (byte) ((bytes[i + 1] >> 6) & 0x03);
- bytes[i] = (byte) ((bytes[i] << 2) | in);
- }
- // (k%4) * 2
- if (kmerlength % 4 != 0) {
- bytes[0] &= (1 << ((kmerlength % 4) << 1)) - 1;
- }
- bytes[size - 1] = (byte) ((bytes[size - 1] << 2) | c);
- return (byte) (1 << output);
- }
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreCode(byte c) {
+ int pos = ((kmerlength - 1) % 4) << 1;
+ byte output = (byte) ((bytes[0] >> pos) & 0x03);
+ for (int i = 0; i < size - 1; i++) {
+ byte in = (byte) ((bytes[i + 1] >> 6) & 0x03);
+ bytes[i] = (byte) ((bytes[i] << 2) | in);
+ }
+ // (k%4) * 2
+ if (kmerlength % 4 != 0) {
+ bytes[0] &= (1 << ((kmerlength % 4) << 1)) - 1;
+ }
+ bytes[size - 1] = (byte) ((bytes[size - 1] << 2) | c);
+ return (byte) (1 << output);
+ }
- public void set(KmerBytesWritable newData) {
- set(newData.bytes, 0, newData.size);
- }
+ public void set(KmerBytesWritable newData) {
+ set(newData.bytes, 0, newData.size);
+ }
- public void set(byte[] newData, int offset, int length) {
- System.arraycopy(newData, offset, bytes, 0, size);
- }
+ public void set(byte[] newData, int offset, int length) {
+ System.arraycopy(newData, offset, bytes, 0, size);
+ }
- /**
- * Don't read the kmerlength from datastream,
- * Read it from configuration
- */
- @Override
- public void readFields(DataInput in) throws IOException {
- in.readFully(bytes, 0, size);
- }
+ /**
+ * Read the kmerlength from the datastream (as written by write),
+ * then read the packed kmer bytes, reallocating the buffer if it is too small
+ */
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ this.kmerlength = in.readInt();
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ if ( this.bytes.length < this.size){
+ this.bytes = new byte[this.size];
+ }
+ in.readFully(bytes, 0, size);
+ }
- @Override
- public void write(DataOutput out) throws IOException {
- out.write(bytes, 0, size);
- }
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(kmerlength);
+ out.write(bytes, 0, size);
+ }
- @Override
- public int hashCode() {
- return super.hashCode() * this.kmerlength;
- }
+ @Override
+ public int hashCode() {
+ return super.hashCode() * this.kmerlength;
+ }
- @Override
- public boolean equals(Object right_obj) {
- if (right_obj instanceof KmerBytesWritable)
- return this.kmerlength == ((KmerBytesWritable) right_obj).kmerlength
- && super.equals(right_obj);
- return false;
- }
+ @Override
+ public boolean equals(Object right_obj) {
+ if (right_obj instanceof KmerBytesWritable)
+ return this.kmerlength == ((KmerBytesWritable) right_obj).kmerlength && super.equals(right_obj);
+ return false;
+ }
- @Override
- public String toString() {
- return KmerUtil.recoverKmerFrom(this.kmerlength, this.getBytes(), 0,
- this.getLength());
- }
+ @Override
+ public String toString() {
+ return KmerUtil.recoverKmerFrom(this.kmerlength, this.getBytes(), 0, this.getLength());
+ }
- public static class Comparator extends WritableComparator {
- public Comparator() {
- super(KmerBytesWritable.class);
- }
+ public static class Comparator extends WritableComparator {
+ public Comparator() {
+ super(KmerBytesWritable.class);
+ }
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- return compareBytes(b1, s1, l1, b2, s2, l2);
- }
- }
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ return compareBytes(b1, s1, l1, b2, s2, l2);
+ }
+ }
- static { // register this comparator
- WritableComparator.define(KmerBytesWritable.class, new Comparator());
- }
+ static { // register this comparator
+ WritableComparator.define(KmerBytesWritable.class, new Comparator());
+ }
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
index 60ad5a3..fab7001 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
@@ -21,50 +21,49 @@
import org.apache.hadoop.io.Writable;
public class KmerCountValue implements Writable {
- private byte adjBitMap;
- private byte count;
+ private byte adjBitMap;
+ private byte count;
- public KmerCountValue(byte bitmap, byte count) {
- set(bitmap, count);
- }
+ public KmerCountValue(byte bitmap, byte count) {
+ set(bitmap, count);
+ }
- public KmerCountValue() {
- adjBitMap = 0;
- count = 0;
- }
+ public KmerCountValue() {
+ adjBitMap = 0;
+ count = 0;
+ }
- @Override
- public void readFields(DataInput arg0) throws IOException {
- adjBitMap = arg0.readByte();
- count = arg0.readByte();
- }
+ @Override
+ public void readFields(DataInput arg0) throws IOException {
+ adjBitMap = arg0.readByte();
+ count = arg0.readByte();
+ }
- @Override
- public void write(DataOutput arg0) throws IOException {
- arg0.writeByte(adjBitMap);
- arg0.writeByte(count);
- }
+ @Override
+ public void write(DataOutput arg0) throws IOException {
+ arg0.writeByte(adjBitMap);
+ arg0.writeByte(count);
+ }
- @Override
- public String toString() {
- return GeneCode.getSymbolFromBitMap(adjBitMap) + '\t'
- + String.valueOf(count);
- }
+ @Override
+ public String toString() {
+ return GeneCode.getSymbolFromBitMap(adjBitMap) + '\t' + String.valueOf(count);
+ }
- public void set(byte bitmap, byte count) {
- this.adjBitMap = bitmap;
- this.count = count;
- }
+ public void set(byte bitmap, byte count) {
+ this.adjBitMap = bitmap;
+ this.count = count;
+ }
- public byte getAdjBitMap() {
- return adjBitMap;
- }
+ public byte getAdjBitMap() {
+ return adjBitMap;
+ }
- public void setAdjBitMap(byte adjBitMap) {
- this.adjBitMap = adjBitMap;
- }
+ public void setAdjBitMap(byte adjBitMap) {
+ this.adjBitMap = adjBitMap;
+ }
- public byte getCount() {
- return count;
- }
+ public byte getCount() {
+ return count;
+ }
}
\ No newline at end of file
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
index 82fe1a1..9dc1dde 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
@@ -1,33 +1,31 @@
package edu.uci.ics.genomix.type;
-
public class KmerUtil {
- public static int getByteNumFromK(int k) {
- int x = k / 4;
- if (k % 4 != 0) {
- x += 1;
- }
- return x;
- }
-
- public static byte reverseKmerByte(byte k) {
- int x = (((k >> 2) & 0x33) | ((k << 2) & 0xcc));
- return (byte) (((x >> 4) & 0x0f) | ((x << 4) & 0xf0));
- }
-
- public static String recoverKmerFrom(int k, byte[] keyData, int keyStart,
- int keyLength) {
- StringBuilder strKmer = new StringBuilder();
- int byteId = keyStart + keyLength - 1;
- byte currentbyte = keyData[byteId];
- for (int geneCount = 0; geneCount < k; geneCount++) {
- if (geneCount % 4 == 0 && geneCount > 0) {
- currentbyte = keyData[--byteId];
- }
- strKmer.append((char) GeneCode.GENE_SYMBOL[(currentbyte >> ((geneCount % 4) * 2)) & 0x03]);
- }
- return strKmer.toString();
- }
-
+ public static int getByteNumFromK(int k) {
+ int x = k / 4;
+ if (k % 4 != 0) {
+ x += 1;
+ }
+ return x;
+ }
+
+ public static byte reverseKmerByte(byte k) {
+ int x = (((k >> 2) & 0x33) | ((k << 2) & 0xcc));
+ return (byte) (((x >> 4) & 0x0f) | ((x << 4) & 0xf0));
+ }
+
+ public static String recoverKmerFrom(int k, byte[] keyData, int keyStart, int keyLength) {
+ StringBuilder strKmer = new StringBuilder();
+ int byteId = keyStart + keyLength - 1;
+ byte currentbyte = keyData[byteId];
+ for (int geneCount = 0; geneCount < k; geneCount++) {
+ if (geneCount % 4 == 0 && geneCount > 0) {
+ currentbyte = keyData[--byteId];
+ }
+ strKmer.append((char) GeneCode.GENE_SYMBOL[(currentbyte >> ((geneCount % 4) * 2)) & 0x03]);
+ }
+ return strKmer.toString();
+ }
+
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
index 67de889..b2af5ee 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
@@ -6,119 +6,110 @@
import org.apache.hadoop.io.WritableComparator;
-public class VKmerBytesWritable extends KmerBytesWritable{
+public class VKmerBytesWritable extends KmerBytesWritable {
+ public static final int DEFAULT_KMER_LENGTH = 21;
+
+ public VKmerBytesWritable(){
+ this(DEFAULT_KMER_LENGTH);
+ }
+
+ public VKmerBytesWritable(int k) {
+ super(k);
+ }
- public VKmerBytesWritable(int k) {
- super(k);
- }
-
- public VKmerBytesWritable(KmerBytesWritable other){
- super(other);
- }
+ public VKmerBytesWritable(KmerBytesWritable other) {
+ super(other);
+ }
- public void setSize(int size) {
- if (size > getCapacity()) {
- setCapacity( (size * 3 / 2));
- }
- this.size = size;
- }
+ public void setSize(int size) {
+ if (size > getCapacity()) {
+ setCapacity((size * 3 / 2));
+ }
+ this.size = size;
+ }
- public int getCapacity() {
- return bytes.length;
- }
+ public int getCapacity() {
+ return bytes.length;
+ }
- public void setCapacity(int new_cap) {
- if (new_cap != getCapacity()) {
- byte[] new_data = new byte[new_cap];
- if (new_cap < size) {
- size = new_cap;
- }
- if (size != 0) {
- System.arraycopy(bytes, 0, new_data, 0, size);
- }
- bytes = new_data;
- }
- }
-
- /**
- * Read Kmer from read text into bytes array e.g. AATAG will compress as
- * [0x000G, 0xATAA]
- *
- * @param k
- * @param array
- * @param start
- */
- public void setByRead(int k, byte[] array, int start) {
- reset(k);
- super.setByRead(array, start);
- }
+ public void setCapacity(int new_cap) {
+ if (new_cap != getCapacity()) {
+ byte[] new_data = new byte[new_cap];
+ if (new_cap < size) {
+ size = new_cap;
+ }
+ if (size != 0) {
+ System.arraycopy(bytes, 0, new_data, 0, size);
+ }
+ bytes = new_data;
+ }
+ }
- /**
- * Compress Reversed Kmer into bytes array AATAG will compress as
- * [0x000A,0xATAG]
- *
- * @param input
- * array
- * @param start
- * position
- */
- public void setByReadReverse(int k, byte[] array, int start) {
- reset(k);
- super.setByReadReverse(array, start);
- }
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public void setByRead(int k, byte[] array, int start) {
+ reset(k);
+ super.setByRead(array, start);
+ }
- public void set(KmerBytesWritable newData) {
- set(newData.kmerlength, newData.bytes, 0, newData.size);
- }
+ /**
+ * Compress Reversed Kmer into bytes array AATAG will compress as
+ * [0x000A,0xATAG]
+ *
+ * @param input
+ * array
+ * @param start
+ * position
+ */
+ public void setByReadReverse(int k, byte[] array, int start) {
+ reset(k);
+ super.setByReadReverse(array, start);
+ }
- public void set(int k, byte[] newData, int offset, int length) {
- reset(k);
- System.arraycopy(newData, offset, bytes, 0, size);
- }
+ public void set(KmerBytesWritable newData) {
+ set(newData.kmerlength, newData.bytes, 0, newData.size);
+ }
- /**
- * Reset array by kmerlength
- * @param k
- */
- public void reset(int k) {
- this.kmerlength = k;
- setSize( 0);
- setSize( KmerUtil.getByteNumFromK(k));
- }
+ public void set(int k, byte[] newData, int offset, int length) {
+ reset(k);
+ System.arraycopy(newData, offset, bytes, 0, size);
+ }
- @Override
- public void readFields(DataInput in) throws IOException {
- reset(in.readInt());
- in.readFully(bytes, 0, size);
- }
+ /**
+ * Reset array by kmerlength
+ *
+ * @param k
+ */
+ public void reset(int k) {
+ this.kmerlength = k;
+ setSize(0);
+ setSize(KmerUtil.getByteNumFromK(k));
+ }
- /**
- * Write the kmer to output
- * we don't need to output size, since size is related to kmerlength
- */
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeInt(this.kmerlength);
- out.write(bytes, 0, size);
- }
+ public static class Comparator extends WritableComparator {
+ public final int LEAD_BYTES = 4;
- public static class Comparator extends WritableComparator {
- public final int LEAD_BYTES = 4;
- public Comparator() {
- super(KmerBytesWritable.class);
- }
+ public Comparator() {
+ super(KmerBytesWritable.class);
+ }
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- int kmerlength1 = readInt(b1,s1);
- int kmerlength2 = readInt(b2,s2);
- if (kmerlength1 == kmerlength2){
- compareBytes(b1, s1 + LEAD_BYTES, l1-LEAD_BYTES, b2, s2+LEAD_BYTES, l2-LEAD_BYTES);
- }
- return kmerlength1 - kmerlength2 ;
- }
- }
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ int kmerlength1 = readInt(b1, s1); // the serialized form starts with the 4-byte kmerlength
+ int kmerlength2 = readInt(b2, s2);
+ if (kmerlength1 == kmerlength2) {
+ return compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES); // same length: order by the packed kmer bytes
+ }
+ return kmerlength1 - kmerlength2;
+ }
+ }
- static { // register this comparator
- WritableComparator.define(KmerBytesWritable.class, new Comparator());
- }
+ static { // register this comparator
+ WritableComparator.define(KmerBytesWritable.class, new Comparator());
+ }
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
index a7bcc8b..dfc0ee3 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
@@ -1,241 +1,263 @@
package edu.uci.ics.genomix.type;
public class VKmerBytesWritableFactory {
- private VKmerBytesWritable kmer;
-
- public VKmerBytesWritableFactory(int k){
- kmer = new VKmerBytesWritable(k);
- }
+ private VKmerBytesWritable kmer;
- /**
- * Read Kmer from read text into bytes array e.g. AATAG will compress as
- * [0x000G, 0xATAA]
- *
- * @param k
- * @param array
- * @param start
- */
- public VKmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
- kmer.setByRead(k, array, start);
- return kmer;
- }
+ public VKmerBytesWritableFactory(int k) {
+ kmer = new VKmerBytesWritable(k);
+ }
- /**
- * Compress Reversed Kmer into bytes array AATAG will compress as
- * [0x000A,0xATAG]
- *
- * @param array
- * @param start
- */
- public VKmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
- kmer.setByReadReverse(k, array, start);
- return kmer;
- }
-
- /**
- * Get last kmer from kmer-chain.
- * e.g. kmerChain is AAGCTA, if k =5, it will
- * return AGCTA
- * @param k
- * @param kInChain
- * @param kmerChain
- * @return LastKmer bytes array
- */
- public VKmerBytesWritable getLastKmerFromChain(int lastK, final KmerBytesWritable kmerChain) {
- if (lastK > kmerChain.getKmerLength()) {
- return null;
- }
- if (lastK == kmerChain.getKmerLength()) {
- kmer.set(kmerChain);
- return kmer;
- }
- kmer.reset(lastK);
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public VKmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
+ kmer.setByRead(k, array, start);
+ return kmer;
+ }
- /** from end to start */
- int byteInChain = kmerChain.getLength() - 1 - (kmerChain.getKmerLength() - lastK) / 4;
- int posInByteOfChain = ((kmerChain.getKmerLength() - lastK) % 4) << 1; // *2
- int byteInKmer = kmer.getLength() - 1;
- for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
- kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
- kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
- }
+ /**
+ * Compress Reversed Kmer into bytes array AATAG will compress as
+ * [0x000A,0xATAG]
+ *
+ * @param array
+ * @param start
+ */
+ public VKmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
+ kmer.setByReadReverse(k, array, start);
+ return kmer;
+ }
- /** last kmer byte */
- if (byteInKmer == 0) {
- kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
- }
- return kmer;
- }
-
- /**
- * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if k=5, it will
- * return AAGCT
- *
- * @param k
- * @param kInChain
- * @param kmerChain
- * @return FirstKmer bytes array
- */
- public VKmerBytesWritable getFirstKmerFromChain(int firstK, final KmerBytesWritable kmerChain) {
- if (firstK > kmerChain.getKmerLength()) {
- return null;
- }
- if (firstK == kmerChain.getKmerLength()) {
- kmer.set(kmerChain);
- return kmer;
- }
- kmer.reset(firstK);
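+ /**
+ * Get last kmer from kmer-chain.
+ * e.g. kmerChain is AAGCTA, if lastK = 5, it will
+ * return AGCTA
+ *
+ * @param lastK length of the kmer to take from the end of the chain
+ * @param kmerChain the chain to read from
+ * @return this factory's internal kmer holding the last kmer,
+ * or null if lastK is larger than the kmer length of the chain
+ */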
+ public VKmerBytesWritable getLastKmerFromChain(int lastK, final KmerBytesWritable kmerChain) {
+ if (lastK > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (lastK == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(lastK);
- int i = 1;
- for (; i < kmer.getLength(); i++) {
- kmer.getBytes()[kmer.getLength() - i] = kmerChain.getBytes()[kmerChain.getLength() - i];
- }
- int posInByteOfChain = (firstK % 4) << 1; // *2
- if (posInByteOfChain == 0) {
- kmer.getBytes()[0] = kmerChain.getBytes()[kmerChain.getLength() - i];
- } else {
- kmer.getBytes()[0] = (byte) (kmerChain.getBytes()[kmerChain.getLength() - i] & ((1 << posInByteOfChain) - 1));
- }
- return kmer;
- }
-
- /**
- * Merge kmer with next neighbor in gene-code format.
- * The k of new kmer will increase by 1
- * e.g. AAGCT merge with A => AAGCTA
- * @param k :input k of kmer
- * @param kmer : input bytes of kmer
- * @param nextCode: next neighbor in gene-code format
- * @return the merged Kmer, this K of this Kmer is k+1
- */
- public VKmerBytesWritable mergeKmerWithNextCode(final KmerBytesWritable kmer, byte nextCode) {
- this.kmer.reset(kmer.getKmerLength()+1);
- for (int i = 1; i <= kmer.getLength(); i++) {
- this.kmer.getBytes()[this.kmer.getLength() - i] = kmer.getBytes()[kmer.getLength() - i];
- }
- if (this.kmer.getLength() > kmer.getLength()) {
- this.kmer.getBytes()[0] = (byte) (nextCode & 0x3);
- } else {
- this.kmer.getBytes()[0] = (byte) (kmer.getBytes()[0] | ((nextCode & 0x3) << ((kmer.getKmerLength() % 4) << 1)));
- }
- return this.kmer;
- }
-
- /**
- * Merge kmer with previous neighbor in gene-code format.
- * The k of new kmer will increase by 1
- * e.g. AAGCT merge with A => AAAGCT
- * @param k :input k of kmer
- * @param kmer : input bytes of kmer
- * @param preCode: next neighbor in gene-code format
- * @return the merged Kmer,this K of this Kmer is k+1
- */
- public VKmerBytesWritable mergeKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
- this.kmer.reset(kmer.getKmerLength()+1);
- int byteInMergedKmer = 0;
- if (kmer.getKmerLength() % 4 == 0) {
- this.kmer.getBytes()[0] = (byte) ((kmer.getBytes()[0] >> 6) & 0x3);
- byteInMergedKmer++;
- }
- for (int i = 0; i < kmer.getLength() - 1; i++, byteInMergedKmer++) {
- this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[i] << 2) | ((kmer.getBytes()[ i + 1] >> 6) & 0x3));
- }
- this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[kmer.getLength() - 1] << 2) | (preCode & 0x3));
- return this.kmer;
- }
-
- /**
- * Merge two kmer to one kmer
- * e.g. ACTA + ACCGT => ACTAACCGT
- * @param preK : previous k of kmer
- * @param kmerPre : bytes array of previous kmer
- * @param nextK : next k of kmer
- * @param kmerNext : bytes array of next kmer
- * @return merged kmer, the new k is @preK + @nextK
- */
- public VKmerBytesWritable mergeTwoKmer(final KmerBytesWritable preKmer, final KmerBytesWritable nextKmer) {
- kmer.reset(preKmer.getKmerLength() + nextKmer.getKmerLength());
- int i = 1;
- for (; i <= preKmer.getLength(); i++) {
- kmer.getBytes()[kmer.getLength() - i] = preKmer.getBytes()[preKmer.getLength() - i];
- }
- if ( i > 1){
- i--;
- }
- if (preKmer.getKmerLength() % 4 == 0) {
- for (int j = 1; j <= nextKmer.getLength(); j++) {
- kmer.getBytes()[kmer.getLength() - i - j] = nextKmer.getBytes()[nextKmer.getLength() - j];
- }
- } else {
- int posNeedToMove = ((preKmer.getKmerLength() % 4) << 1);
- kmer.getBytes()[kmer.getLength() - i] |= nextKmer.getBytes()[ nextKmer.getLength() - 1] << posNeedToMove;
- for (int j = 1; j < nextKmer.getLength(); j++) {
- kmer.getBytes()[kmer.getLength() - i - j] = (byte) (((nextKmer.getBytes()[ nextKmer.getLength()
- - j] & 0xff) >> (8 - posNeedToMove)) | (nextKmer.getBytes()[nextKmer.getLength()
- - j - 1] << posNeedToMove));
- }
- if ( nextKmer.getKmerLength() % 4 == 0 || (nextKmer.getKmerLength() % 4) * 2 + posNeedToMove > 8) {
- kmer.getBytes()[0] = (byte) ((0xff & nextKmer.getBytes()[0] )>> (8 - posNeedToMove));
- }
- }
- return kmer;
- }
-
- /**
- * Safely shifted the kmer forward without change the input kmer
- * e.g. AGCGC shift with T => GCGCT
- * @param k: kmer length
- * @param kmer: input kmer
- * @param afterCode: input genecode
- * @return new created kmer that shifted by afterCode, the K will not change
- */
- public VKmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode){
- this.kmer.set(kmer);
- this.kmer.shiftKmerWithNextCode(afterCode);
- return this.kmer;
- }
-
- /**
- * Safely shifted the kmer backward without change the input kmer
- * e.g. AGCGC shift with T => TAGCG
- * @param k: kmer length
- * @param kmer: input kmer
- * @param preCode: input genecode
- * @return new created kmer that shifted by preCode, the K will not change
- */
- public VKmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode){
- this.kmer.set(kmer);
- this.kmer.shiftKmerWithPreCode(preCode);
- return this.kmer;
- }
-
- /**
- * get the reverse sequence of given kmer
- * @param kmer
- */
- public VKmerBytesWritable reverse(final KmerBytesWritable kmer) {
- this.kmer.reset(kmer.getKmerLength());
+ /** from end to start */
+ int byteInChain = kmerChain.getLength() - 1 - (kmerChain.getKmerLength() - lastK) / 4;
+ int posInByteOfChain = ((kmerChain.getKmerLength() - lastK) % 4) << 1; // *2
+ int byteInKmer = kmer.getLength() - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+ kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
+ kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ }
- int curPosAtKmer = ((kmer.getKmerLength() - 1) % 4) << 1;
- int curByteAtKmer = 0;
+ /** last kmer byte */
+ if (byteInKmer == 0) {
+ kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ }
+ return kmer;
+ }
- int curPosAtReverse = 0;
- int curByteAtReverse = this.kmer.getLength() - 1;
- this.kmer.getBytes()[curByteAtReverse] = 0;
- for (int i = 0; i < kmer.getKmerLength(); i++) {
- byte gene = (byte) ((kmer.getBytes()[curByteAtKmer] >> curPosAtKmer) & 0x03);
- this.kmer.getBytes()[curByteAtReverse] |= gene << curPosAtReverse;
- curPosAtReverse += 2;
- if (curPosAtReverse >= 8) {
- curPosAtReverse = 0;
- this.kmer.getBytes()[--curByteAtReverse] = 0;
- }
- curPosAtKmer -= 2;
- if (curPosAtKmer < 0) {
- curPosAtKmer = 6;
- curByteAtKmer++;
- }
- }
- return this.kmer;
- }
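+ /**
+ * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if firstK=5, it will
+ * return AAGCT
+ *
+ * @param firstK length of the kmer to take from the start of the chain
+ * @param kmerChain the chain to read from
+ * @return this factory's internal kmer holding the first kmer,
+ * or null if firstK is larger than the kmer length of the chain
+ */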
+ public VKmerBytesWritable getFirstKmerFromChain(int firstK, final KmerBytesWritable kmerChain) {
+ if (firstK > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (firstK == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(firstK);
+
+ int i = 1;
+ for (; i < kmer.getLength(); i++) {
+ kmer.getBytes()[kmer.getLength() - i] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ }
+ int posInByteOfChain = (firstK % 4) << 1; // *2
+ if (posInByteOfChain == 0) {
+ kmer.getBytes()[0] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ } else {
+ kmer.getBytes()[0] = (byte) (kmerChain.getBytes()[kmerChain.getLength() - i] & ((1 << posInByteOfChain) - 1));
+ }
+ return kmer;
+ }
+
+ /**
+ * Merge kmer with next neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAGCTA
+ * The merged result is written into this factory's internal kmer;
+ * the k of the input is read from the input kmer itself.
+ *
+ * @param kmer
+ * : input kmer
+ * @param nextCode
+ * : next neighbor in gene-code format
+ * @return the merged Kmer, the K of this Kmer is k+1
+ */
+ public VKmerBytesWritable mergeKmerWithNextCode(final KmerBytesWritable kmer, byte nextCode) {
+ this.kmer.reset(kmer.getKmerLength() + 1);
+ for (int i = 1; i <= kmer.getLength(); i++) {
+ this.kmer.getBytes()[this.kmer.getLength() - i] = kmer.getBytes()[kmer.getLength() - i];
+ }
+ if (this.kmer.getLength() > kmer.getLength()) {
+ this.kmer.getBytes()[0] = (byte) (nextCode & 0x3);
+ } else {
+ this.kmer.getBytes()[0] = (byte) (kmer.getBytes()[0] | ((nextCode & 0x3) << ((kmer.getKmerLength() % 4) << 1)));
+ }
+ return this.kmer;
+ }
+
+ /**
+ * Merge kmer with previous neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAAGCT
+ * The merged result is written into this factory's internal kmer;
+ * the k of the input is read from the input kmer itself.
+ *
+ * @param kmer
+ * : input kmer
+ * @param preCode
+ * : previous neighbor in gene-code format
+ * @return the merged Kmer, the K of this Kmer is k+1
+ */
+ public VKmerBytesWritable mergeKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+ this.kmer.reset(kmer.getKmerLength() + 1);
+ int byteInMergedKmer = 0;
+ if (kmer.getKmerLength() % 4 == 0) {
+ this.kmer.getBytes()[0] = (byte) ((kmer.getBytes()[0] >> 6) & 0x3);
+ byteInMergedKmer++;
+ }
+ for (int i = 0; i < kmer.getLength() - 1; i++, byteInMergedKmer++) {
+ this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[i] << 2) | ((kmer.getBytes()[i + 1] >> 6) & 0x3));
+ }
+ this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[kmer.getLength() - 1] << 2) | (preCode & 0x3));
+ return this.kmer;
+ }
+
+ /**
+ * Merge two kmer to one kmer
+ * e.g. ACTA + ACCGT => ACTAACCGT
+ * The merged result is written into this factory's internal kmer;
+ * the k of each input is read from the input kmer itself.
+ *
+ * @param preKmer
+ * : previous kmer
+ * @param nextKmer
+ * : next kmer
+ * @return merged kmer (this factory's internal kmer),
+ * the new k is the k of preKmer plus
+ * the k of nextKmer
+ */
+ public VKmerBytesWritable mergeTwoKmer(final KmerBytesWritable preKmer, final KmerBytesWritable nextKmer) {
+ kmer.reset(preKmer.getKmerLength() + nextKmer.getKmerLength());
+ int i = 1;
+ for (; i <= preKmer.getLength(); i++) {
+ kmer.getBytes()[kmer.getLength() - i] = preKmer.getBytes()[preKmer.getLength() - i];
+ }
+ if (i > 1) {
+ i--;
+ }
+ if (preKmer.getKmerLength() % 4 == 0) {
+ for (int j = 1; j <= nextKmer.getLength(); j++) {
+ kmer.getBytes()[kmer.getLength() - i - j] = nextKmer.getBytes()[nextKmer.getLength() - j];
+ }
+ } else {
+ int posNeedToMove = ((preKmer.getKmerLength() % 4) << 1);
+ kmer.getBytes()[kmer.getLength() - i] |= nextKmer.getBytes()[nextKmer.getLength() - 1] << posNeedToMove;
+ for (int j = 1; j < nextKmer.getLength(); j++) {
+ kmer.getBytes()[kmer.getLength() - i - j] = (byte) (((nextKmer.getBytes()[nextKmer.getLength() - j] & 0xff) >> (8 - posNeedToMove)) | (nextKmer
+ .getBytes()[nextKmer.getLength() - j - 1] << posNeedToMove));
+ }
+ if (nextKmer.getKmerLength() % 4 == 0 || (nextKmer.getKmerLength() % 4) * 2 + posNeedToMove > 8) {
+ kmer.getBytes()[0] = (byte) ((0xff & nextKmer.getBytes()[0]) >> (8 - posNeedToMove));
+ }
+ }
+ return kmer;
+ }
+
+ /**
+ * Shift a copy of the kmer forward without changing the input kmer
+ * e.g. AGCGC shift with T => GCGCT
+ * The copy is held in this factory's internal kmer, which the other
+ * methods of this factory also write to.
+ *
+ * @param kmer
+ * : input kmer
+ * @param afterCode
+ * : input genecode
+ * @return this factory's internal kmer holding the input shifted by afterCode, the K will not change
+ */
+ public VKmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode) {
+ this.kmer.set(kmer);
+ this.kmer.shiftKmerWithNextCode(afterCode);
+ return this.kmer;
+ }
+
+ /**
+ * Shift a copy of the kmer backward without changing the input kmer
+ * e.g. AGCGC shift with T => TAGCG
+ * The copy is held in this factory's internal kmer, which the other
+ * methods of this factory also write to.
+ *
+ * @param kmer
+ * : input kmer
+ * @param preCode
+ * : input genecode
+ * @return this factory's internal kmer holding the input shifted by preCode, the K will not change
+ */
+ public VKmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+ this.kmer.set(kmer);
+ this.kmer.shiftKmerWithPreCode(preCode);
+ return this.kmer;
+ }
+
+ /**
+ * get the reverse sequence of given kmer
+ *
+ * @param kmer
+ */
+ public VKmerBytesWritable reverse(final KmerBytesWritable kmer) {
+ this.kmer.reset(kmer.getKmerLength());
+
+ int curPosAtKmer = ((kmer.getKmerLength() - 1) % 4) << 1;
+ int curByteAtKmer = 0;
+
+ int curPosAtReverse = 0;
+ int curByteAtReverse = this.kmer.getLength() - 1;
+ this.kmer.getBytes()[curByteAtReverse] = 0;
+ for (int i = 0; i < kmer.getKmerLength(); i++) {
+ byte gene = (byte) ((kmer.getBytes()[curByteAtKmer] >> curPosAtKmer) & 0x03);
+ this.kmer.getBytes()[curByteAtReverse] |= gene << curPosAtReverse;
+ curPosAtReverse += 2;
+ if (curPosAtReverse >= 8) {
+ curPosAtReverse = 0;
+ this.kmer.getBytes()[--curByteAtReverse] = 0;
+ }
+ curPosAtKmer -= 2;
+ if (curPosAtKmer < 0) {
+ curPosAtKmer = 6;
+ curByteAtKmer++;
+ }
+ }
+ return this.kmer;
+ }
}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
index ea1d0c2..f21da91 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
@@ -8,73 +8,71 @@
import edu.uci.ics.genomix.type.KmerBytesWritable;
public class KmerBytesWritableTest {
- static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
- static int k = 7;
-
- @Test
- public void TestCompressKmer() {
- KmerBytesWritable kmer = new KmerBytesWritable(k);
- kmer.setByRead( array, 0);
- Assert.assertEquals(kmer.toString(), "AATAGAA");
-
- kmer.setByRead( array, 1);
- Assert.assertEquals(kmer.toString(), "ATAGAAG");
- }
-
- @Test
- public void TestMoveKmer(){
- KmerBytesWritable kmer = new KmerBytesWritable(k);
- kmer.setByRead( array, 0);
- Assert.assertEquals(kmer.toString(), "AATAGAA");
-
- for (int i = k; i < array.length-1; i++) {
- kmer.shiftKmerWithNextCode(array[i]);
- Assert.assertTrue(false);
- }
+ static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
+ static int k = 7;
- byte out = kmer.shiftKmerWithNextChar( array[array.length - 1]);
- Assert.assertEquals(out, GeneCode.getAdjBit((byte) 'A'));
- Assert.assertEquals(kmer.toString(), "ATAGAAG");
- }
-
-
- @Test
- public void TestCompressKmerReverse() {
- KmerBytesWritable kmer = new KmerBytesWritable(k);
- kmer.setByRead( array, 0);
- Assert.assertEquals(kmer.toString(), "AATAGAA");
-
- kmer.setByReadReverse( array, 1);
- Assert.assertEquals(kmer.toString(), "GAAGATA");
- }
-
- @Test
- public void TestMoveKmerReverse(){
- KmerBytesWritable kmer = new KmerBytesWritable(k);
- kmer.setByRead( array, 0);
- Assert.assertEquals(kmer.toString(), "AATAGAA");
-
- for (int i = k; i < array.length-1; i++) {
- kmer.shiftKmerWithPreChar( array[i]);
- Assert.assertTrue(false);
- }
+ @Test
+ public void TestCompressKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
- byte out = kmer.shiftKmerWithPreChar(array[array.length - 1]);
- Assert.assertEquals(out, GeneCode.getAdjBit((byte) 'A'));
- Assert.assertEquals(kmer.toString(), "GAATAGA");
- }
+ kmer.setByRead(array, 1);
+ Assert.assertEquals(kmer.toString(), "ATAGAAG");
+ }
- @Test
- public void TestGetGene(){
- KmerBytesWritable kmer = new KmerBytesWritable(9);
- String text = "AGCTGACCG";
- byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C','G' };
- kmer.setByRead( array, 0);
-
- for(int i =0; i < 9; i++){
- Assert.assertEquals(text.charAt(i),
- (char)(GeneCode.getSymbolFromCode(kmer.getGeneCodeAtPosition(i))));
- }
- }
+ @Test
+ public void TestMoveKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ for (int i = k; i < array.length - 1; i++) {
+ kmer.shiftKmerWithNextCode(array[i]);
+ Assert.assertTrue(false);
+ }
+
+ byte out = kmer.shiftKmerWithNextChar(array[array.length - 1]);
+ Assert.assertEquals(out, GeneCode.getAdjBit((byte) 'A'));
+ Assert.assertEquals(kmer.toString(), "ATAGAAG");
+ }
+
+ @Test
+ public void TestCompressKmerReverse() {
+ KmerBytesWritable kmer = new KmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ kmer.setByReadReverse(array, 1);
+ Assert.assertEquals(kmer.toString(), "GAAGATA");
+ }
+
+ @Test
+ public void TestMoveKmerReverse() {
+ KmerBytesWritable kmer = new KmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ for (int i = k; i < array.length - 1; i++) {
+ kmer.shiftKmerWithPreChar(array[i]);
+ Assert.assertTrue(false);
+ }
+
+ byte out = kmer.shiftKmerWithPreChar(array[array.length - 1]);
+ Assert.assertEquals(out, GeneCode.getAdjBit((byte) 'A'));
+ Assert.assertEquals(kmer.toString(), "GAATAGA");
+ }
+
+ @Test
+ public void TestGetGene() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ String text = "AGCTGACCG";
+ byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G' };
+ kmer.setByRead(array, 0);
+
+ for (int i = 0; i < 9; i++) {
+ Assert.assertEquals(text.charAt(i), (char) (GeneCode.getSymbolFromCode(kmer.getGeneCodeAtPosition(i))));
+ }
+ }
}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
index a0b8845..c40729c 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
@@ -9,150 +9,150 @@
import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
public class VKmerBytesWritableFactoryTest {
- static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C','G','T'};
-
- VKmerBytesWritableFactory kmerFactory = new VKmerBytesWritableFactory(8);
-
- @Test
- public void TestDegree(){
- Assert.assertTrue(GeneCode.inDegree((byte) 0xff) == 4);
- Assert.assertTrue(GeneCode.outDegree((byte) 0xff) == 4);
- Assert.assertTrue(GeneCode.inDegree((byte) 0x3f) == 2);
- Assert.assertTrue(GeneCode.outDegree((byte) 0x01) == 1);
- Assert.assertTrue(GeneCode.inDegree((byte) 0x01) == 0);
- }
-
- @Test
- public void TestGetLastKmer(){
- KmerBytesWritable kmer = new KmerBytesWritable(9);
- kmer.setByRead( array, 0);
- Assert.assertEquals("AGCTGACCG", kmer.toString());
- KmerBytesWritable lastKmer ;
- for(int i = 8; i>0 ; i--){
- lastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
- Assert.assertEquals("AGCTGACCG".substring(9-i), lastKmer.toString());
- }
- VKmerBytesWritable vlastKmer ;
- for(int i = 8; i>0 ; i--){
- vlastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
- Assert.assertEquals("AGCTGACCG".substring(9-i), vlastKmer.toString());
- }
- }
-
- @Test
- public void TestMergeNext(){
- KmerBytesWritable kmer = new KmerBytesWritable(9);
- kmer.setByRead(array, 0);
- Assert.assertEquals("AGCTGACCG", kmer.toString());
-
- String text = "AGCTGACCG";
- for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
- KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
- text = text + (char)GeneCode.GENE_SYMBOL[x];
- Assert.assertEquals(text, newkmer.toString());
- kmer = new KmerBytesWritable(newkmer);
- }
- for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
- KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
- text = text + (char)GeneCode.GENE_SYMBOL[x];
- Assert.assertEquals(text, newkmer.toString());
- kmer = new KmerBytesWritable(newkmer);
- }
- }
-
- @Test
- public void TestMergePre(){
- KmerBytesWritable kmer = new KmerBytesWritable(9);
- kmer.setByRead(array, 0);
- Assert.assertEquals("AGCTGACCG", kmer.toString());
- String text = "AGCTGACCG";
- for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
- KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
- text = (char)GeneCode.GENE_SYMBOL[x] + text;
- Assert.assertEquals(text , newkmer.toString());
- kmer = new KmerBytesWritable(newkmer);
- }
- for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
- KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
- text = (char)GeneCode.GENE_SYMBOL[x] + text;
- Assert.assertEquals(text , newkmer.toString());
- kmer = new KmerBytesWritable(newkmer);
- }
- }
-
- @Test
- public void TestMergeTwoKmer(){
- KmerBytesWritable kmer1 = new KmerBytesWritable(9);
- kmer1.setByRead( array, 0);
- String text1 = "AGCTGACCG";
- KmerBytesWritable kmer2 = new KmerBytesWritable(9);
- kmer2.setByRead(array, 1);
- String text2 = "GCTGACCGT";
- Assert.assertEquals(text1, kmer1.toString());
- Assert.assertEquals(text2, kmer2.toString());
-
- KmerBytesWritable merged = kmerFactory.mergeTwoKmer(kmer1, kmer2);
- Assert.assertEquals(text1+text2, merged.toString());
-
- KmerBytesWritable kmer3 = new KmerBytesWritable(3);
- kmer3.setByRead(array, 1);
- String text3 = "GCT";
- Assert.assertEquals(text3, kmer3.toString());
-
- merged = kmerFactory.mergeTwoKmer(kmer1, kmer3);
- Assert.assertEquals(text1+text3, merged.toString());
- merged = kmerFactory.mergeTwoKmer( kmer3, kmer1);
- Assert.assertEquals(text3+text1, merged.toString());
-
- KmerBytesWritable kmer4 = new KmerBytesWritable(8);
- kmer4.setByRead( array, 0);
- String text4 = "AGCTGACC";
- Assert.assertEquals(text4, kmer4.toString());
- merged = kmerFactory.mergeTwoKmer(kmer4, kmer3);
- Assert.assertEquals(text4+text3, merged.toString());
-
- KmerBytesWritable kmer5 = new KmerBytesWritable(7);
- kmer5.setByRead( array, 0);
- String text5 = "AGCTGAC";
- VKmerBytesWritable kmer6 = new VKmerBytesWritable(9);
- kmer6.setByRead(9, array, 1);
- String text6 = "GCTGACCGT";
- merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
- Assert.assertEquals(text5+text6, merged.toString());
-
- kmer6.setByRead(6, array, 1);
- String text7 = "GCTGAC";
- merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
- Assert.assertEquals(text5+text7, merged.toString());
-
- kmer6.setByRead(4, array, 1);
- String text8 = "GCTG";
- merged = kmerFactory.mergeTwoKmer( kmer5, kmer6);
- Assert.assertEquals(text5+text8, merged.toString());
+ static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
- }
- @Test
- public void TestShift(){
- VKmerBytesWritable kmer = new VKmerBytesWritable(kmerFactory.getKmerByRead(9, array, 0));
- String text = "AGCTGACCG";
- Assert.assertEquals(text, kmer.toString());
-
- VKmerBytesWritable kmerForward = kmerFactory.shiftKmerWithNextCode(kmer,GeneCode.A);
- Assert.assertEquals(text, kmer.toString());
- Assert.assertEquals("GCTGACCGA", kmerForward.toString());
- VKmerBytesWritable kmerBackward = kmerFactory.shiftKmerWithPreCode(kmer,GeneCode.C);
- Assert.assertEquals(text, kmer.toString());
- Assert.assertEquals("CAGCTGACC", kmerBackward.toString());
-
- }
+ VKmerBytesWritableFactory kmerFactory = new VKmerBytesWritableFactory(8);
-
- @Test
- public void TestReverseKmer(){
- KmerBytesWritable kmer = new KmerBytesWritable(7);
- kmer.setByRead( array, 0);
- Assert.assertEquals(kmer.toString(), "AGCTGAC");
- KmerBytesWritable reversed = kmerFactory.reverse(kmer);
- Assert.assertEquals(reversed.toString(), "CAGTCGA");
- }
+ @Test
+ public void TestDegree() {
+ Assert.assertTrue(GeneCode.inDegree((byte) 0xff) == 4);
+ Assert.assertTrue(GeneCode.outDegree((byte) 0xff) == 4);
+ Assert.assertTrue(GeneCode.inDegree((byte) 0x3f) == 2);
+ Assert.assertTrue(GeneCode.outDegree((byte) 0x01) == 1);
+ Assert.assertTrue(GeneCode.inDegree((byte) 0x01) == 0);
+ }
+
+ @Test
+ public void TestGetLastKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ KmerBytesWritable lastKmer;
+ for (int i = 8; i > 0; i--) {
+ lastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
+ }
+ VKmerBytesWritable vlastKmer;
+ for (int i = 8; i > 0; i--) {
+ vlastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9 - i), vlastKmer.toString());
+ }
+ }
+
+ @Test
+ public void TestMergeNext() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+
+ String text = "AGCTGACCG";
+ for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
+ text = text + (char) GeneCode.GENE_SYMBOL[x];
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
+ text = text + (char) GeneCode.GENE_SYMBOL[x];
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ }
+
+ @Test
+ public void TestMergePre() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ String text = "AGCTGACCG";
+ for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
+ text = (char) GeneCode.GENE_SYMBOL[x] + text;
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
+ text = (char) GeneCode.GENE_SYMBOL[x] + text;
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ }
+
+ @Test
+ public void TestMergeTwoKmer() {
+ KmerBytesWritable kmer1 = new KmerBytesWritable(9);
+ kmer1.setByRead(array, 0);
+ String text1 = "AGCTGACCG";
+ KmerBytesWritable kmer2 = new KmerBytesWritable(9);
+ kmer2.setByRead(array, 1);
+ String text2 = "GCTGACCGT";
+ Assert.assertEquals(text1, kmer1.toString());
+ Assert.assertEquals(text2, kmer2.toString());
+
+ KmerBytesWritable merged = kmerFactory.mergeTwoKmer(kmer1, kmer2);
+ Assert.assertEquals(text1 + text2, merged.toString());
+
+ KmerBytesWritable kmer3 = new KmerBytesWritable(3);
+ kmer3.setByRead(array, 1);
+ String text3 = "GCT";
+ Assert.assertEquals(text3, kmer3.toString());
+
+ merged = kmerFactory.mergeTwoKmer(kmer1, kmer3);
+ Assert.assertEquals(text1 + text3, merged.toString());
+ merged = kmerFactory.mergeTwoKmer(kmer3, kmer1);
+ Assert.assertEquals(text3 + text1, merged.toString());
+
+ KmerBytesWritable kmer4 = new KmerBytesWritable(8);
+ kmer4.setByRead(array, 0);
+ String text4 = "AGCTGACC";
+ Assert.assertEquals(text4, kmer4.toString());
+ merged = kmerFactory.mergeTwoKmer(kmer4, kmer3);
+ Assert.assertEquals(text4 + text3, merged.toString());
+
+ KmerBytesWritable kmer5 = new KmerBytesWritable(7);
+ kmer5.setByRead(array, 0);
+ String text5 = "AGCTGAC";
+ VKmerBytesWritable kmer6 = new VKmerBytesWritable(9);
+ kmer6.setByRead(9, array, 1);
+ String text6 = "GCTGACCGT";
+ merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
+ Assert.assertEquals(text5 + text6, merged.toString());
+
+ kmer6.setByRead(6, array, 1);
+ String text7 = "GCTGAC";
+ merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
+ Assert.assertEquals(text5 + text7, merged.toString());
+
+ kmer6.setByRead(4, array, 1);
+ String text8 = "GCTG";
+ merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
+ Assert.assertEquals(text5 + text8, merged.toString());
+
+ }
+
+ @Test
+ public void TestShift() {
+ VKmerBytesWritable kmer = new VKmerBytesWritable(kmerFactory.getKmerByRead(9, array, 0));
+ String text = "AGCTGACCG";
+ Assert.assertEquals(text, kmer.toString());
+
+ VKmerBytesWritable kmerForward = kmerFactory.shiftKmerWithNextCode(kmer, GeneCode.A);
+ Assert.assertEquals(text, kmer.toString());
+ Assert.assertEquals("GCTGACCGA", kmerForward.toString());
+ VKmerBytesWritable kmerBackward = kmerFactory.shiftKmerWithPreCode(kmer, GeneCode.C);
+ Assert.assertEquals(text, kmer.toString());
+ Assert.assertEquals("CAGCTGACC", kmerBackward.toString());
+
+ }
+
+ @Test
+ public void TestReverseKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(7);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AGCTGAC");
+ KmerBytesWritable reversed = kmerFactory.reverse(kmer);
+ Assert.assertEquals(reversed.toString(), "CAGTCGA");
+ }
}
diff --git a/genomix/genomix-data/src/test/resources/hadoop/conf/core-site.xml b/genomix/genomix-data/src/test/resources/hadoop/conf/core-site.xml
index 47dfac5..3e5bacb 100644
--- a/genomix/genomix-data/src/test/resources/hadoop/conf/core-site.xml
+++ b/genomix/genomix-data/src/test/resources/hadoop/conf/core-site.xml
@@ -5,14 +5,14 @@
<configuration>
-<property>
- <name>fs.default.name</name>
- <value>hdfs://127.0.0.1:31888</value>
-</property>
-<property>
- <name>hadoop.tmp.dir</name>
- <value>/tmp/hadoop</value>
-</property>
+ <property>
+ <name>fs.default.name</name>
+ <value>hdfs://127.0.0.1:31888</value>
+ </property>
+ <property>
+ <name>hadoop.tmp.dir</name>
+ <value>/tmp/hadoop</value>
+ </property>
</configuration>
diff --git a/genomix/genomix-data/src/test/resources/hadoop/conf/hdfs-site.xml b/genomix/genomix-data/src/test/resources/hadoop/conf/hdfs-site.xml
index 8d29b1d..b1b1902 100644
--- a/genomix/genomix-data/src/test/resources/hadoop/conf/hdfs-site.xml
+++ b/genomix/genomix-data/src/test/resources/hadoop/conf/hdfs-site.xml
@@ -5,14 +5,14 @@
<configuration>
-<property>
- <name>dfs.replication</name>
- <value>1</value>
-</property>
+ <property>
+ <name>dfs.replication</name>
+ <value>1</value>
+ </property>
-<property>
- <name>dfs.block.size</name>
- <value>65536</value>
-</property>
+ <property>
+ <name>dfs.block.size</name>
+ <value>65536</value>
+ </property>
</configuration>
diff --git a/genomix/genomix-data/src/test/resources/hadoop/conf/mapred-site.xml b/genomix/genomix-data/src/test/resources/hadoop/conf/mapred-site.xml
index 39b6505..525e7d5 100644
--- a/genomix/genomix-data/src/test/resources/hadoop/conf/mapred-site.xml
+++ b/genomix/genomix-data/src/test/resources/hadoop/conf/mapred-site.xml
@@ -5,21 +5,21 @@
<configuration>
- <property>
- <name>mapred.job.tracker</name>
- <value>localhost:29007</value>
- </property>
- <property>
- <name>mapred.tasktracker.map.tasks.maximum</name>
- <value>20</value>
- </property>
- <property>
- <name>mapred.tasktracker.reduce.tasks.maximum</name>
- <value>20</value>
- </property>
- <property>
- <name>mapred.max.split.size</name>
- <value>2048</value>
- </property>
+ <property>
+ <name>mapred.job.tracker</name>
+ <value>localhost:29007</value>
+ </property>
+ <property>
+ <name>mapred.tasktracker.map.tasks.maximum</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>mapred.tasktracker.reduce.tasks.maximum</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>mapred.max.split.size</name>
+ <value>2048</value>
+ </property>
</configuration>
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingMapper.java
index fe56e1a..dca3808 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingMapper.java
@@ -16,7 +16,6 @@
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
@@ -27,13 +26,12 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
-import edu.uci.ics.genomix.type.Kmer;
import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
@SuppressWarnings({ "unused", "deprecation" })
-public class ResultsCheckingMapper extends MapReduceBase implements Mapper<BytesWritable, KmerCountValue, Text, Text> {
- BytesWritable valWriter = new BytesWritable();
+public class ResultsCheckingMapper extends MapReduceBase implements Mapper<KmerBytesWritable, KmerCountValue, Text, Text> {
+ KmerBytesWritable valWriter;
private final static IntWritable one = new IntWritable(1);
public static Text textkey = new Text();
public static Text textvalue = new Text();
@@ -42,16 +40,16 @@
public void configure(JobConf job) {
KMER_SIZE = job.getInt("sizeKmer", 0);
+ valWriter= new KmerBytesWritable(KMER_SIZE);
}
@Override
- public void map(BytesWritable key, KmerCountValue value, OutputCollector<Text, Text> output, Reporter reporter)
+ public void map(KmerBytesWritable key, KmerCountValue value, OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
String filename = fileSplit.getPath().getName();
- byte[] bkey = key.getBytes();
- textkey.set(Kmer.recoverKmerFrom(KMER_SIZE, key.getBytes(), 0, key.getLength()) + "\t" + value.toString());
+ textkey.set(key.toString() + "\t" + value.toString());
textvalue.set(filename);
output.collect(textkey, textvalue);
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
index 5d39928..09a1bf1 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
@@ -18,12 +18,12 @@
import java.io.IOException;
import java.util.Iterator;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
/**
@@ -31,12 +31,12 @@
*/
@SuppressWarnings("deprecation")
public class GenomixCombiner extends MapReduceBase implements
- Reducer<BytesWritable, KmerCountValue, BytesWritable, KmerCountValue> {
- public KmerCountValue vaWriter = new KmerCountValue();
+ Reducer<KmerBytesWritable, KmerCountValue, KmerBytesWritable, KmerCountValue> {
+ private KmerCountValue vaWriter = new KmerCountValue();
@Override
- public void reduce(BytesWritable key, Iterator<KmerCountValue> values,
- OutputCollector<BytesWritable, KmerCountValue> output, Reporter reporter) throws IOException {
+ public void reduce(KmerBytesWritable key, Iterator<KmerCountValue> values,
+ OutputCollector<KmerBytesWritable, KmerCountValue> output, Reporter reporter) throws IOException {
byte groupByAdjList = 0;
int count = 0;
byte bytCount = 0;
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java
index b90ab23..e2f0f85 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java
@@ -19,7 +19,6 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
@@ -28,6 +27,8 @@
import org.apache.hadoop.mapred.TextInputFormat;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
/**
@@ -64,12 +65,12 @@
conf.setReducerClass(GenomixReducer.class);
conf.setCombinerClass(GenomixCombiner.class);
- conf.setMapOutputKeyClass(BytesWritable.class);
+ conf.setMapOutputKeyClass(KmerBytesWritable.class);
conf.setMapOutputValueClass(KmerCountValue.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
- conf.setOutputKeyClass(BytesWritable.class);
+ conf.setOutputKeyClass(KmerBytesWritable.class);
conf.setOutputValueClass(KmerCountValue.class);
FileInputFormat.setInputPaths(conf, new Path(inputPath));
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
index cd8b7e3..d36be17 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
@@ -19,7 +19,6 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
@@ -28,8 +27,8 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
-import edu.uci.ics.genomix.type.Kmer;
-import edu.uci.ics.genomix.type.Kmer.GENE_CODE;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
/**
@@ -37,7 +36,7 @@
*/
@SuppressWarnings("deprecation")
public class GenomixMapper extends MapReduceBase implements
- Mapper<LongWritable, Text, BytesWritable, KmerCountValue> {
+ Mapper<LongWritable, Text, KmerBytesWritable, KmerCountValue> {
public class CurrenByte {
public byte curByte;
@@ -45,12 +44,14 @@
}
public static int KMER_SIZE;
- public KmerCountValue outputAdjList = new KmerCountValue();
- public BytesWritable outputKmer = new BytesWritable();
+ public KmerCountValue outputAdjList;
+ public KmerBytesWritable outputKmer;
@Override
public void configure(JobConf job) {
KMER_SIZE = Integer.parseInt(job.get("sizeKmer"));
+ outputAdjList = new KmerCountValue();
+ outputKmer = new KmerBytesWritable(KMER_SIZE);
}
/*succeed node
@@ -64,7 +65,7 @@
G 01000000 64
T 10000000 128*/
@Override
- public void map(LongWritable key, Text value, OutputCollector<BytesWritable, KmerCountValue> output,
+ public void map(LongWritable key, Text value, OutputCollector<KmerBytesWritable, KmerCountValue> output,
Reporter reporter) throws IOException {
/* A 00
C 01
@@ -78,28 +79,25 @@
/** first kmer */
byte count = 1;
byte[] array = geneLine.getBytes();
- byte[] kmer = Kmer.compressKmer(KMER_SIZE, array, 0);
+ outputKmer.setByRead( array, 0);
byte pre = 0;
- byte next = GENE_CODE.getAdjBit(array[KMER_SIZE]);
- byte adj = GENE_CODE.mergePreNextAdj(pre, next);
+ byte next = GeneCode.getAdjBit(array[KMER_SIZE]);
+ byte adj = GeneCode.mergePreNextAdj(pre, next);
outputAdjList.set(adj, count);
- outputKmer.set(kmer, 0, kmer.length);
output.collect(outputKmer, outputAdjList);
/** middle kmer */
for (int i = KMER_SIZE; i < array.length - 1; i++) {
- pre = Kmer.moveKmer(KMER_SIZE, kmer, array[i]);
- next = GENE_CODE.getAdjBit(array[i + 1]);
- adj = GENE_CODE.mergePreNextAdj(pre, next);
+ pre = outputKmer.shiftKmerWithNextChar(array[i]);
+ next = GeneCode.getAdjBit(array[i + 1]);
+ adj = GeneCode.mergePreNextAdj(pre, next);
outputAdjList.set(adj, count);
- outputKmer.set(kmer, 0, kmer.length);
output.collect(outputKmer, outputAdjList);
}
/** last kmer */
- pre = Kmer.moveKmer(KMER_SIZE, kmer, array[array.length - 1]);
+ pre = outputKmer.shiftKmerWithNextChar(array[array.length - 1]);
next = 0;
- adj = GENE_CODE.mergePreNextAdj(pre, next);
+ adj = GeneCode.mergePreNextAdj(pre, next);
outputAdjList.set(adj, count);
- outputKmer.set(kmer, 0, kmer.length);
output.collect(outputKmer, outputAdjList);
}
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
index 676d6f1..1fba709 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
@@ -17,12 +17,12 @@
import java.io.IOException;
import java.util.Iterator;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
/**
@@ -30,12 +30,12 @@
*/
@SuppressWarnings("deprecation")
public class GenomixReducer extends MapReduceBase implements
- Reducer<BytesWritable, KmerCountValue, BytesWritable, KmerCountValue> {
+ Reducer<KmerBytesWritable, KmerCountValue, KmerBytesWritable, KmerCountValue> {
KmerCountValue valWriter = new KmerCountValue();
static enum MyCounters { NUM_RECORDS };
@Override
- public void reduce(BytesWritable key, Iterator<KmerCountValue> values,
- OutputCollector<BytesWritable, KmerCountValue> output, Reporter reporter) throws IOException {
+ public void reduce(KmerBytesWritable key, Iterator<KmerCountValue> values,
+ OutputCollector<KmerBytesWritable, KmerCountValue> output, Reporter reporter) throws IOException {
byte groupByAdjList = 0;
int count = 0;
byte bytCount = 0;
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterDriver.java
index b3a6102..d54eca2 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterDriver.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterDriver.java
@@ -18,7 +18,6 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
@@ -28,6 +27,8 @@
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
@SuppressWarnings("deprecation")
public class CountFilterDriver {
private static class Options {
@@ -59,13 +60,13 @@
conf.setReducerClass(CountFilterReducer.class);
conf.setCombinerClass(CountFilterReducer.class);
- conf.setMapOutputKeyClass(BytesWritable.class);
+ conf.setMapOutputKeyClass(KmerBytesWritable.class);
conf.setMapOutputValueClass(ByteWritable.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
- conf.setOutputKeyClass(BytesWritable.class);
+ conf.setOutputKeyClass(KmerBytesWritable.class);
conf.setOutputValueClass(ByteWritable.class);
FileInputFormat.setInputPaths(conf, new Path(inputPath));
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterMapper.java
index 80557e9..2a54217 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterMapper.java
@@ -15,33 +15,32 @@
package edu.uci.ics.graphcountfilter;
import java.io.IOException;
+
import org.apache.hadoop.io.ByteWritable;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
+
import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
-@SuppressWarnings({ "unused", "deprecation" })
+@SuppressWarnings({ "deprecation" })
public class CountFilterMapper extends MapReduceBase implements
- Mapper<BytesWritable, KmerCountValue, BytesWritable, ByteWritable> {
- public static int THRESHOLD;
- public BytesWritable outputKmer = new BytesWritable();
- public ByteWritable outputAdjList = new ByteWritable();
+ Mapper<KmerBytesWritable, KmerCountValue, KmerBytesWritable, ByteWritable> {
+ private int THRESHOLD;
+ private ByteWritable adjByte = new ByteWritable();
@Override
public void configure(JobConf job) {
THRESHOLD = Integer.parseInt(job.get("countThreshold"));
}
- public void map(BytesWritable key, KmerCountValue value, OutputCollector<BytesWritable, ByteWritable> output,
+ public void map(KmerBytesWritable key, KmerCountValue value, OutputCollector<KmerBytesWritable, ByteWritable> output,
Reporter reporter) throws IOException {
if(value.getCount() >= THRESHOLD){
- outputKmer.set(key);
- outputAdjList.set(value.getAdjBitMap());
- output.collect(outputKmer, outputAdjList);
+ adjByte.set(value.getAdjBitMap());
+ output.collect(key, adjByte );
}
}
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterReducer.java
index c692336..dd33451 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterReducer.java
@@ -17,18 +17,19 @@
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.ByteWritable;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
@SuppressWarnings("deprecation")
public class CountFilterReducer extends MapReduceBase implements
- Reducer<BytesWritable, ByteWritable, BytesWritable, ByteWritable> {
+ Reducer<KmerBytesWritable, ByteWritable, KmerBytesWritable, ByteWritable> {
@Override
- public void reduce(BytesWritable key, Iterator<ByteWritable> values,
- OutputCollector<BytesWritable, ByteWritable> output, Reporter reporter) throws IOException {
+ public void reduce(KmerBytesWritable key, Iterator<ByteWritable> values,
+ OutputCollector<KmerBytesWritable, ByteWritable> output, Reporter reporter) throws IOException {
output.collect(key, values.next()); //Output the Pair
}
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathDriver.java
index 5025a7b..cfdf8d5 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathDriver.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathDriver.java
@@ -17,7 +17,6 @@
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
@@ -29,6 +28,8 @@
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
@SuppressWarnings("deprecation")
public class MergePathDriver {
@@ -67,13 +68,13 @@
conf.setMapperClass(SNodeInitialMapper.class);
conf.setReducerClass(SNodeInitialReducer.class);
- conf.setMapOutputKeyClass(BytesWritable.class);
+ conf.setMapOutputKeyClass(KmerBytesWritable.class);
conf.setMapOutputValueClass(MergePathValueWritable.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
- conf.setOutputKeyClass(BytesWritable.class);
+ conf.setOutputKeyClass(KmerBytesWritable.class);
conf.setOutputValueClass(MergePathValueWritable.class);
FileInputFormat.setInputPaths(conf, new Path(inputPath));
@@ -97,7 +98,7 @@
conf.setMapperClass(MergePathMapper.class);
conf.setReducerClass(MergePathReducer.class);
- conf.setMapOutputKeyClass(BytesWritable.class);
+ conf.setMapOutputKeyClass(KmerBytesWritable.class);
conf.setMapOutputValueClass(MergePathValueWritable.class);
conf.setInputFormat(SequenceFileInputFormat.class);
@@ -107,14 +108,14 @@
String complete = "complete" + iMerge;
MultipleOutputs.addNamedOutput(conf, uncomplete,
- MergePathMultiSeqOutputFormat.class, BytesWritable.class,
+ MergePathMultiSeqOutputFormat.class, KmerBytesWritable.class,
MergePathValueWritable.class);
MultipleOutputs.addNamedOutput(conf, complete,
- MergePathMultiSeqOutputFormat.class, BytesWritable.class,
+ MergePathMultiSeqOutputFormat.class, KmerBytesWritable.class,
MergePathValueWritable.class);
- conf.setOutputKeyClass(BytesWritable.class);
+ conf.setOutputKeyClass(KmerBytesWritable.class);
conf.setOutputValueClass(MergePathValueWritable.class);
FileInputFormat.setInputPaths(conf, new Path(inputPath + "-step1"));
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMapper.java
index 1d772b2..c3255f2 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMapper.java
@@ -15,31 +15,37 @@
package edu.uci.ics.pathmerging;
import java.io.IOException;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
-import edu.uci.ics.genomix.type.Kmer;
-import edu.uci.ics.genomix.type.KmerUtil;
-import edu.uci.ics.genomix.type.Kmer.GENE_CODE;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
@SuppressWarnings("deprecation")
public class MergePathMapper extends MapReduceBase implements
- Mapper<BytesWritable, MergePathValueWritable, BytesWritable, MergePathValueWritable> {
- public static int KMER_SIZE;
- public BytesWritable outputKmer = new BytesWritable();
- public MergePathValueWritable outputAdjList = new MergePathValueWritable();
-
+ Mapper<KmerBytesWritable, MergePathValueWritable, KmerBytesWritable, MergePathValueWritable> {
+ private int KMER_SIZE;
+ private VKmerBytesWritableFactory outputKmerFactory;
+ private MergePathValueWritable outputAdjList;
+ private VKmerBytesWritable tmpKmer;
+ private VKmerBytesWritable outputKmer;
public void configure(JobConf job) {
KMER_SIZE = job.getInt("sizeKmer", 0);
+ outputKmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+ outputAdjList = new MergePathValueWritable();
+ tmpKmer = new VKmerBytesWritable(KMER_SIZE);
+ outputKmer = new VKmerBytesWritable(KMER_SIZE);
}
@Override
- public void map(BytesWritable key, MergePathValueWritable value,
- OutputCollector<BytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ public void map(KmerBytesWritable key, MergePathValueWritable value,
+ OutputCollector<KmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
byte precursor = (byte) 0xF0;
byte succeed = (byte) 0x0F;
@@ -49,24 +55,18 @@
precursor = (byte) ((precursor & 0xff) >> 4);
succeed = (byte) (succeed & adjBitMap);
- byte[] kmerValue = key.getBytes();
- int kmerLength = key.getLength();
if (bitFlag == 1) {
- byte succeedCode = GENE_CODE.getGeneCodeFromBitMap(succeed);
- int originalByteNum = Kmer.getByteNumFromK(KMER_SIZE);
- byte[] tmpKmer = KmerUtil.getLastKmerFromChain(KMER_SIZE, value.getKmerSize(), kmerValue, 0, kmerLength);
- byte[] newKmer = KmerUtil.shiftKmerWithNextCode(KMER_SIZE, tmpKmer,0, tmpKmer.length, succeedCode);
- outputKmer.set(newKmer, 0, originalByteNum);
+ byte succeedCode = GeneCode.getGeneCodeFromBitMap(succeed);
+ tmpKmer.set(outputKmerFactory.getLastKmerFromChain(KMER_SIZE, key));
+ outputKmer.set(outputKmerFactory.shiftKmerWithNextCode(tmpKmer, succeedCode));
- int mergeByteNum = Kmer.getByteNumFromK(value.getKmerSize() - (KMER_SIZE - 1));
- byte[] mergeKmer = KmerUtil.getFirstKmerFromChain(value.getKmerSize() - (KMER_SIZE - 1),
- value.getKmerSize(), kmerValue, 0, kmerLength);
- outputAdjList.set(mergeKmer, 0, mergeByteNum, adjBitMap, bitFlag, value.getKmerSize() - (KMER_SIZE - 1));
+ KmerBytesWritable mergedKmer = outputKmerFactory.getFirstKmerFromChain(value.getKmerSize()
+ - (KMER_SIZE - 1), key); // slice the key chain, as the replaced KmerUtil call did; value.getKmer() is null when the value has no bytes
+ outputAdjList.set(mergedKmer, adjBitMap, bitFlag);
output.collect(outputKmer, outputAdjList);
} else {
- outputKmer.set(key);
outputAdjList.set(value);
- output.collect(outputKmer, outputAdjList);
+ output.collect(key, outputAdjList);
}
}
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathReducer.java
index 2397a98..cead0e8 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathReducer.java
@@ -16,7 +16,6 @@
import java.io.IOException;
import java.util.Iterator;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
@@ -24,53 +23,52 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
-import edu.uci.ics.genomix.type.Kmer;
-import edu.uci.ics.genomix.type.KmerUtil;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
@SuppressWarnings("deprecation")
public class MergePathReducer extends MapReduceBase implements
- Reducer<BytesWritable, MergePathValueWritable, BytesWritable, MergePathValueWritable> {
- public BytesWritable outputKmer = new BytesWritable();
- public static int KMER_SIZE;
- public MergePathValueWritable outputAdjList = new MergePathValueWritable();
+ Reducer<KmerBytesWritable, MergePathValueWritable, KmerBytesWritable, MergePathValueWritable> {
+ private VKmerBytesWritableFactory kmerFactory;
+ private VKmerBytesWritable outputKmer;
+ private VKmerBytesWritable tmpKmer;
+ private int KMER_SIZE;
+ private MergePathValueWritable outputAdjList;
MultipleOutputs mos = null;
- public static int I_MERGE;
+ private int I_MERGE;
public void configure(JobConf job) {
mos = new MultipleOutputs(job);
I_MERGE = Integer.parseInt(job.get("iMerge"));
KMER_SIZE = job.getInt("sizeKmer", 0);
+ outputAdjList = new MergePathValueWritable();
+ kmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+ outputKmer = new VKmerBytesWritable(KMER_SIZE);
+ tmpKmer = new VKmerBytesWritable(KMER_SIZE);
}
@SuppressWarnings("unchecked")
@Override
- public void reduce(BytesWritable key, Iterator<MergePathValueWritable> values,
- OutputCollector<BytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ public void reduce(KmerBytesWritable key, Iterator<MergePathValueWritable> values,
+ OutputCollector<KmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
outputAdjList = values.next();
+ int firstKmerSize = outputAdjList.getKmerSize(); // capture now: outputAdjList is reassigned by values.next() before the size is used
if (values.hasNext() == true) {
- byte[] keyBytes = key.getBytes();
- int keyLength = key.getLength();
if (outputAdjList.getFlag() == 1) {
byte adjBitMap = outputAdjList.getAdjBitMap();
byte bitFlag = outputAdjList.getFlag();
- int kmerSize = outputAdjList.getKmerSize();
- int mergeByteNum = Kmer.getByteNumFromK(KMER_SIZE + kmerSize);
- byte[] valueBytes = outputAdjList.getBytes();
- int valueLength = outputAdjList.getLength();
+ outputKmer.set(kmerFactory.mergeTwoKmer(outputAdjList.getKmer(), key));
- byte[] mergeKmer = KmerUtil.mergeTwoKmer(outputAdjList.getKmerSize(), valueBytes,0, valueLength,
- KMER_SIZE, keyBytes, 0, keyLength);
- outputKmer.set(mergeKmer, 0, mergeByteNum);
-
outputAdjList = values.next();
byte nextAdj = outputAdjList.getAdjBitMap();
byte succeed = (byte) 0x0F;
succeed = (byte) (succeed & nextAdj);
adjBitMap = (byte) (adjBitMap & 0xF0);
adjBitMap = (byte) (adjBitMap | succeed);
- outputAdjList.set(null, 0, 0, adjBitMap, bitFlag, KMER_SIZE + kmerSize);
+ outputAdjList.set(null, 0, 0, adjBitMap, bitFlag, KMER_SIZE + firstKmerSize); // size of the merged (first) value, matching the removed kmerSize local
mos.getCollector("uncomplete" + I_MERGE, reporter).collect(outputKmer, outputAdjList);
} else {
@@ -81,13 +79,8 @@
byte adjBitMap = outputAdjList.getAdjBitMap();
byte flag = outputAdjList.getFlag();
int kmerSize = outputAdjList.getKmerSize();
- int mergeByteNum = Kmer.getByteNumFromK(KMER_SIZE + kmerSize);
- byte[] valueBytes = outputAdjList.getBytes();
- int valueLength =outputAdjList.getLength();
- byte[] mergeKmer = KmerUtil.mergeTwoKmer(outputAdjList.getKmerSize(), valueBytes, 0, valueLength,
- KMER_SIZE, keyBytes, 0, keyLength);
- outputKmer.set(mergeKmer, 0, mergeByteNum);
-
+
+ outputKmer.set(kmerFactory.mergeTwoKmer(outputAdjList.getKmer(), key));
adjBitMap = (byte) (adjBitMap & 0xF0);
adjBitMap = (byte) (adjBitMap | succeed);
outputAdjList.set(null, 0, 0, adjBitMap, flag, KMER_SIZE + kmerSize);
@@ -95,19 +88,13 @@
mos.getCollector("uncomplete" + I_MERGE, reporter).collect(outputKmer, outputAdjList);
}
} else {
- byte[] keyBytes = key.getBytes();
- int keyLength = key.getLength();
if (outputAdjList.getFlag() != 0) {
byte adjBitMap = outputAdjList.getAdjBitMap();
byte flag = outputAdjList.getFlag();
int kmerSize = outputAdjList.getKmerSize();
- int mergeByteNum = Kmer.getByteNumFromK(KMER_SIZE - 1 + kmerSize);
- byte[] tmpKmer = KmerUtil.getFirstKmerFromChain(KMER_SIZE - 1, KMER_SIZE, keyBytes,0,keyLength);
- byte[] valueBytes = outputAdjList.getBytes();
- int valueLength = outputAdjList.getLength();
- byte[] mergeKmer = KmerUtil.mergeTwoKmer(outputAdjList.getKmerSize(), valueBytes,0, valueLength,
- KMER_SIZE - 1, tmpKmer,0,tmpKmer.length);
- outputKmer.set(mergeKmer, 0, mergeByteNum);
+
+ tmpKmer.set(kmerFactory.getFirstKmerFromChain(KMER_SIZE - 1, key));
+ outputKmer.set(kmerFactory.mergeTwoKmer(outputAdjList.getKmer(), tmpKmer));
outputAdjList.set(null, 0, 0, adjBitMap, flag, KMER_SIZE + kmerSize);
mos.getCollector("complete" + I_MERGE, reporter).collect(outputKmer, outputAdjList);
} else
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathValueWritable.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathValueWritable.java
index f1dee39..c5ff116 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathValueWritable.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathValueWritable.java
@@ -14,13 +14,16 @@
*/
package edu.uci.ics.pathmerging;
-import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
+import java.io.IOException;
+
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.WritableComparable;
-import edu.uci.ics.genomix.type.Kmer;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
public class MergePathValueWritable extends BinaryComparable implements WritableComparable<BinaryComparable> {
@@ -31,6 +34,8 @@
private byte adjBitMap;
private byte flag;
private int kmerSize;
+
+ private VKmerBytesWritable kmer;
public MergePathValueWritable() {
this((byte) 0, (byte) 0, (byte) 0, EMPTY_BYTES);
@@ -43,6 +48,8 @@
this.bytes = bytes;
this.size = bytes.length;
+ this.kmer = new VKmerBytesWritable(kmerSize);
+ kmer.set(bytes, 0, bytes.length);
}
public void setSize(int size) {
@@ -72,17 +79,29 @@
public void set(MergePathValueWritable newData) {
set(newData.bytes, 0, newData.size, newData.adjBitMap, newData.flag, newData.kmerSize);
}
+
+ public void set(KmerBytesWritable mergedKmer, byte adjBitMap, byte bitFlag) {
+ set(mergedKmer.getBytes(),0,mergedKmer.getLength(), adjBitMap, bitFlag, mergedKmer.getKmerLength());
+ }
public void set(byte[] newData, int offset, int length, byte adjBitMap, byte flag, int kmerSize) {
setSize(0);
if (length != 0) {
setSize(length);
System.arraycopy(newData, offset, bytes, 0, size);
+ kmer.set(kmerSize, newData, offset, length);
}
this.adjBitMap = adjBitMap;
this.flag = flag;
this.kmerSize = kmerSize;
}
+
+ public KmerBytesWritable getKmer(){
+ if (size != 0){
+ return kmer;
+ }
+ return null;
+ }
@Override
public void readFields(DataInput arg0) throws IOException {
@@ -91,6 +110,7 @@
setSize(arg0.readInt());
if(size != 0){
arg0.readFully(bytes, 0, size);
+ kmer.set(bytes,0,size);
}
adjBitMap = arg0.readByte();
flag = arg0.readByte();
@@ -145,6 +165,8 @@
}
sb.append(num);
}
- return Kmer.GENE_CODE.getSymbolFromBitMap(adjBitMap) + '\t' + String.valueOf(flag) + '\t' + sb.toString();
+ return GeneCode.getSymbolFromBitMap(adjBitMap) + '\t' + String.valueOf(flag) + '\t' + sb.toString();
}
+
+
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialMapper.java
index 2e25c0d..ae824e7 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialMapper.java
@@ -15,25 +15,28 @@
package edu.uci.ics.pathmerging;
import java.io.IOException;
+
import org.apache.hadoop.io.ByteWritable;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
-import edu.uci.ics.genomix.type.KmerUtil;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
@SuppressWarnings("deprecation")
public class SNodeInitialMapper extends MapReduceBase implements
- Mapper<BytesWritable, ByteWritable, BytesWritable, MergePathValueWritable> {
+ Mapper<KmerBytesWritable, ByteWritable, KmerBytesWritable, MergePathValueWritable> {
- public static int KMER_SIZE;
- public BytesWritable outputKmer = new BytesWritable();
- public MergePathValueWritable outputAdjList = new MergePathValueWritable();
+ public int KMER_SIZE;
+ public KmerBytesWritable outputKmer;
+ public MergePathValueWritable outputAdjList;
public void configure(JobConf job) {
KMER_SIZE = Integer.parseInt(job.get("sizeKmer"));
+ outputKmer = new KmerBytesWritable(KMER_SIZE);
+ outputAdjList = new MergePathValueWritable();
}
boolean measureDegree(byte adjacent) {
@@ -92,8 +95,8 @@
}
@Override
- public void map(BytesWritable key, ByteWritable value,
- OutputCollector<BytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ public void map(KmerBytesWritable key, ByteWritable value,
+ OutputCollector<KmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
byte precursor = (byte) 0xF0;
byte succeed = (byte) 0x0F;
@@ -105,8 +108,6 @@
boolean inDegree = measureDegree(precursor);
boolean outDegree = measureDegree(succeed);
byte initial = 0;
- byte[] kmerValue = key.getBytes();
- int kmerLength = key.getLength();
if (inDegree == true && outDegree == false) {
flag = (byte) 2;
switch (succeed) {
@@ -123,8 +124,8 @@
initial = (byte) 0x03;
break;
}
- byte[] newKmer = KmerUtil.shiftKmerWithNextCode(KMER_SIZE, kmerValue,0, kmerLength, initial);
- outputKmer.set(newKmer, 0, kmerValue.length);
+ outputKmer.set(key);
+ outputKmer.shiftKmerWithNextCode(initial);
adjBitMap = (byte) (adjBitMap & 0xF0);
outputAdjList.set(null, 0, 0, adjBitMap, flag, KMER_SIZE);
output.collect(outputKmer, outputAdjList);
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialReducer.java
index 44a0af2..734abd6 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialReducer.java
@@ -16,31 +16,31 @@
import java.io.IOException;
import java.util.Iterator;
-import org.apache.hadoop.io.BytesWritable;
+
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
@SuppressWarnings("deprecation")
public class SNodeInitialReducer extends MapReduceBase implements
- Reducer<BytesWritable, MergePathValueWritable, BytesWritable, MergePathValueWritable> {
- public BytesWritable outputKmer = new BytesWritable();
- public MergePathValueWritable outputAdjList = new MergePathValueWritable();
+ Reducer<KmerBytesWritable, MergePathValueWritable, KmerBytesWritable, MergePathValueWritable> {
+ private MergePathValueWritable outputAdjList = new MergePathValueWritable();
@Override
- public void reduce(BytesWritable key, Iterator<MergePathValueWritable> values,
- OutputCollector<BytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ public void reduce(KmerBytesWritable key, Iterator<MergePathValueWritable> values,
+ OutputCollector<KmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
outputAdjList = values.next();
- outputKmer.set(key);
if (values.hasNext() == true) {
if (outputAdjList.getFlag() != 2) {
byte adjBitMap = outputAdjList.getAdjBitMap();
int kmerSize = outputAdjList.getKmerSize();
byte bitFlag = 1;
outputAdjList.set(null, 0, 0, adjBitMap, bitFlag, kmerSize);
- output.collect(outputKmer, outputAdjList);
-
+ output.collect(key, outputAdjList);
+
} else {
boolean flag = false;
while (values.hasNext()) {
@@ -55,12 +55,11 @@
int kmerSize = outputAdjList.getKmerSize();
byte bitFlag = 1;
outputAdjList.set(null, 0, 0, adjBitMap, bitFlag, kmerSize);
- output.collect(outputKmer, outputAdjList);
+ output.collect(key, outputAdjList);
}
}
- }
- else {
- output.collect(outputKmer, outputAdjList);
+ } else {
+ output.collect(key, outputAdjList);
}
}
}
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java
index 6bd3bd5..7b0005b 100755
--- a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java
@@ -26,14 +26,13 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;
-import edu.uci.ics.genomix.type.Kmer;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
import edu.uci.ics.utils.TestUtils;
/**
@@ -72,13 +71,13 @@
SequenceFile.Reader reader = null;
Path path = new Path(RESULT_PATH + "/part-00000");
reader = new SequenceFile.Reader(dfs, path, conf);
- BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ KmerBytesWritable key = (KmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
File filePathTo = new File(TEST_SOURCE_DIR);
BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
while (reader.next(key, value)) {
- bw.write(Kmer.recoverKmerFrom(SIZE_KMER, key.getBytes(), 0, key.getLength()) + "\t" + value.toString());
+ bw.write(key.toString() + "\t" + value.toString());
bw.newLine();
}
bw.close();
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphcountfilter/CountFilterTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphcountfilter/CountFilterTest.java
index 4bf0be7..c09937c 100644
--- a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphcountfilter/CountFilterTest.java
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphcountfilter/CountFilterTest.java
@@ -13,13 +13,12 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.ByteWritable;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;
-import edu.uci.ics.genomix.type.Kmer;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.utils.TestUtils;
@@ -54,12 +53,12 @@
SequenceFile.Reader reader = null;
Path path = new Path(RESULT_PATH + "/part-00000");
reader = new SequenceFile.Reader(dfs, path, conf);
- BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ KmerBytesWritable key = (KmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
ByteWritable value = (ByteWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
File filePathTo = new File(TEST_SOURCE_DIR);
BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
while (reader.next(key, value)) {
- bw.write(Kmer.recoverKmerFrom(SIZE_KMER, key.getBytes(), 0, key.getLength()) + "\t" + value.toString());
+ bw.write(key.toString() + "\t" + value.toString());
bw.newLine();
}
bw.close();
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmerging/MergePathTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmerging/MergePathTest.java
index 7b8d285..95a9785 100644
--- a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmerging/MergePathTest.java
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmerging/MergePathTest.java
@@ -11,12 +11,13 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.utils.TestUtils;
@SuppressWarnings("deprecation")
@@ -52,7 +53,7 @@
SequenceFile.Reader reader = null;
Path path = new Path(HDFA_PATH_DATA + "/complete2" + "/complete2-r-00000");
reader = new SequenceFile.Reader(dfs, path, conf);
- BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ KmerBytesWritable key = (KmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
MergePathValueWritable value = (MergePathValueWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
File filePathTo = new File(TEST_SOURCE_DIR);
BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/ByteSerializerDeserializer.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/ByteSerializerDeserializer.java
index 98159c0..41e3e4b 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/ByteSerializerDeserializer.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/ByteSerializerDeserializer.java
@@ -7,41 +7,39 @@
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
-public class ByteSerializerDeserializer implements
- ISerializerDeserializer<Byte> {
+public class ByteSerializerDeserializer implements ISerializerDeserializer<Byte> {
- private static final long serialVersionUID = 1L;
+ private static final long serialVersionUID = 1L;
- public static final ByteSerializerDeserializer INSTANCE = new ByteSerializerDeserializer();
+ public static final ByteSerializerDeserializer INSTANCE = new ByteSerializerDeserializer();
- private ByteSerializerDeserializer() {
- }
+ private ByteSerializerDeserializer() {
+ }
- @Override
- public Byte deserialize(DataInput in) throws HyracksDataException {
- try {
- return in.readByte();
- } catch (IOException e) {
- throw new HyracksDataException(e);
- }
- }
+ @Override
+ public Byte deserialize(DataInput in) throws HyracksDataException {
+ try {
+ return in.readByte();
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
- @Override
- public void serialize(Byte instance, DataOutput out)
- throws HyracksDataException {
- try {
- out.writeByte(instance.intValue());
- } catch (IOException e) {
- throw new HyracksDataException(e);
- }
- }
+ @Override
+ public void serialize(Byte instance, DataOutput out) throws HyracksDataException {
+ try {
+ out.writeByte(instance.intValue());
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
- public static byte getByte(byte[] bytes, int offset) {
- return bytes[offset];
- }
+ public static byte getByte(byte[] bytes, int offset) {
+ return bytes[offset];
+ }
- public static void putByte(byte val, byte[] bytes, int offset) {
- bytes[offset] = val;
- }
+ public static void putByte(byte val, byte[] bytes, int offset) {
+ bytes[offset] = val;
+ }
}
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerBinaryHashFunctionFamily.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerBinaryHashFunctionFamily.java
index e7aa481..c1e9804 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerBinaryHashFunctionFamily.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerBinaryHashFunctionFamily.java
@@ -5,25 +5,25 @@
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFamily;
public class KmerBinaryHashFunctionFamily implements IBinaryHashFunctionFamily {
- private static final long serialVersionUID = 1L;
+ private static final long serialVersionUID = 1L;
- @Override
- public IBinaryHashFunction createBinaryHashFunction(final int seed) {
+ @Override
+ public IBinaryHashFunction createBinaryHashFunction(final int seed) {
- return new IBinaryHashFunction() {
- private KmerPointable p = new KmerPointable();
-
- @Override
- public int hash(byte[] bytes, int offset, int length) {
- if (length + offset >= bytes.length)
- throw new IllegalStateException("out of bound");
- p.set(bytes, offset, length);
- int hash = p.hash() * (seed + 1);
- if (hash < 0) {
- hash = -(hash+1);
- }
- return hash;
- }
- };
- }
+ return new IBinaryHashFunction() {
+ private KmerPointable p = new KmerPointable();
+
+ @Override
+ public int hash(byte[] bytes, int offset, int length) {
+ if (length + offset >= bytes.length)
+ throw new IllegalStateException("out of bound");
+ p.set(bytes, offset, length);
+ int hash = p.hash() * (seed + 1);
+ if (hash < 0) {
+ hash = -(hash + 1);
+ }
+ return hash;
+ }
+ };
+ }
}
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerHashPartitioncomputerFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerHashPartitioncomputerFactory.java
index 231470a..86ca296 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerHashPartitioncomputerFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerHashPartitioncomputerFactory.java
@@ -6,39 +6,36 @@
import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputerFactory;
-public class KmerHashPartitioncomputerFactory implements
- ITuplePartitionComputerFactory {
+public class KmerHashPartitioncomputerFactory implements ITuplePartitionComputerFactory {
- private static final long serialVersionUID = 1L;
+ private static final long serialVersionUID = 1L;
- public static int hashBytes(byte[] bytes, int offset, int length) {
- int hash = 1;
- for (int i = offset; i < offset + length; i++)
- hash = (31 * hash) + (int) bytes[i];
- return hash;
- }
+ public static int hashBytes(byte[] bytes, int offset, int length) {
+ int hash = 1;
+ for (int i = offset; i < offset + length; i++)
+ hash = (31 * hash) + (int) bytes[i];
+ return hash;
+ }
- @Override
- public ITuplePartitionComputer createPartitioner() {
- return new ITuplePartitionComputer() {
- @Override
- public int partition(IFrameTupleAccessor accessor, int tIndex,
- int nParts) {
- int startOffset = accessor.getTupleStartOffset(tIndex);
- int fieldOffset = accessor.getFieldStartOffset(tIndex, 0);
- int slotLength = accessor.getFieldSlotsLength();
- int fieldLength = accessor.getFieldLength(tIndex, 0);
+ @Override
+ public ITuplePartitionComputer createPartitioner() {
+ return new ITuplePartitionComputer() {
+ @Override
+ public int partition(IFrameTupleAccessor accessor, int tIndex, int nParts) {
+ int startOffset = accessor.getTupleStartOffset(tIndex);
+ int fieldOffset = accessor.getFieldStartOffset(tIndex, 0);
+ int slotLength = accessor.getFieldSlotsLength();
+ int fieldLength = accessor.getFieldLength(tIndex, 0);
- ByteBuffer buf = accessor.getBuffer();
+ ByteBuffer buf = accessor.getBuffer();
- int hash = hashBytes(buf.array(), startOffset + fieldOffset
- + slotLength, fieldLength);
- if (hash < 0){
- hash = -(hash+1);
- }
+ int hash = hashBytes(buf.array(), startOffset + fieldOffset + slotLength, fieldLength);
+ if (hash < 0) {
+ hash = -(hash + 1);
+ }
- return hash % nParts;
- }
- };
- }
+ return hash % nParts;
+ }
+ };
+ }
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerNormarlizedComputerFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerNormarlizedComputerFactory.java
index 1ca90c2..d1d0054 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerNormarlizedComputerFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerNormarlizedComputerFactory.java
@@ -4,20 +4,19 @@
import edu.uci.ics.hyracks.api.dataflow.value.INormalizedKeyComputer;
import edu.uci.ics.hyracks.api.dataflow.value.INormalizedKeyComputerFactory;
-public class KmerNormarlizedComputerFactory implements
- INormalizedKeyComputerFactory {
- private static final long serialVersionUID = 8735044913496854551L;
+public class KmerNormarlizedComputerFactory implements INormalizedKeyComputerFactory {
+ private static final long serialVersionUID = 1L;
- @Override
- public INormalizedKeyComputer createNormalizedKeyComputer() {
- return new INormalizedKeyComputer() {
- /**
- * read one int from Kmer, make sure this int is consistent whith Kmer compartor
- */
- @Override
- public int normalize(byte[] bytes, int start, int length) {
- return KmerPointable.getIntReverse(bytes, start, length);
- }
- };
- }
+ @Override
+ public INormalizedKeyComputer createNormalizedKeyComputer() {
+ return new INormalizedKeyComputer() {
+ /**
+ * read one int from Kmer, make sure this int is consistent with Kmer comparator
+ */
+ @Override
+ public int normalize(byte[] bytes, int start, int length) {
+ return KmerPointable.getIntReverse(bytes, start, length);
+ }
+ };
+ }
}
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/primitive/KmerPointable.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/primitive/KmerPointable.java
index ae07355..b4aa4c7 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/primitive/KmerPointable.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/primitive/KmerPointable.java
@@ -9,130 +9,122 @@
import edu.uci.ics.hyracks.data.std.api.IPointable;
import edu.uci.ics.hyracks.data.std.api.IPointableFactory;
-public final class KmerPointable extends AbstractPointable implements
- IHashable, IComparable, INumeric {
- public static final ITypeTraits TYPE_TRAITS = new ITypeTraits() {
- private static final long serialVersionUID = 1L;
+public final class KmerPointable extends AbstractPointable implements IHashable, IComparable, INumeric {
+ public static final ITypeTraits TYPE_TRAITS = new ITypeTraits() {
+ private static final long serialVersionUID = 1L;
- @Override
- public boolean isFixedLength() {
- return false;
- }
+ @Override
+ public boolean isFixedLength() {
+ return false;
+ }
- @Override
- public int getFixedLength() {
- return -1;
- }
- };
+ @Override
+ public int getFixedLength() {
+ return -1;
+ }
+ };
- public static final IPointableFactory FACTORY = new IPointableFactory() {
- private static final long serialVersionUID = 1L;
+ public static final IPointableFactory FACTORY = new IPointableFactory() {
+ private static final long serialVersionUID = 1L;
- @Override
- public IPointable createPointable() {
- return new KmerPointable();
- }
+ @Override
+ public IPointable createPointable() {
+ return new KmerPointable();
+ }
- @Override
- public ITypeTraits getTypeTraits() {
- return TYPE_TRAITS;
- }
- };
+ @Override
+ public ITypeTraits getTypeTraits() {
+ return TYPE_TRAITS;
+ }
+ };
- public static short getShortReverse(byte[] bytes, int offset, int length) {
- if (length < 2) {
- return (short) (bytes[offset] & 0xff);
- }
- return (short) (((bytes[offset + length - 1] & 0xff) << 8) + (bytes[offset
- + length - 2] & 0xff));
- }
+ public static short getShortReverse(byte[] bytes, int offset, int length) {
+ if (length < 2) {
+ return (short) (bytes[offset] & 0xff);
+ }
+ return (short) (((bytes[offset + length - 1] & 0xff) << 8) + (bytes[offset + length - 2] & 0xff));
+ }
- public static int getIntReverse(byte[] bytes, int offset, int length) {
- int shortValue = getShortReverse(bytes, offset, length) & 0xffff;
+ public static int getIntReverse(byte[] bytes, int offset, int length) {
+ int shortValue = getShortReverse(bytes, offset, length) & 0xffff;
- if (length < 3) {
- return shortValue;
- }
- if (length == 3) {
- return (((bytes[offset + 2] & 0xff) << 16)
- + ((bytes[offset + 1] & 0xff) << 8) + ((bytes[offset] & 0xff)));
- }
- return ((bytes[offset + length - 1] & 0xff) << 24)
- + ((bytes[offset + length - 2] & 0xff) << 16)
- + ((bytes[offset + length - 3] & 0xff) << 8)
- + ((bytes[offset + length - 4] & 0xff) << 0);
- }
+ if (length < 3) {
+ return shortValue;
+ }
+ if (length == 3) {
+ return (((bytes[offset + 2] & 0xff) << 16) + ((bytes[offset + 1] & 0xff) << 8) + ((bytes[offset] & 0xff)));
+ }
+ return ((bytes[offset + length - 1] & 0xff) << 24) + ((bytes[offset + length - 2] & 0xff) << 16)
+ + ((bytes[offset + length - 3] & 0xff) << 8) + ((bytes[offset + length - 4] & 0xff) << 0);
+ }
- public static long getLongReverse(byte[] bytes, int offset, int length) {
- if (length < 8) {
- return ((long) getIntReverse(bytes, offset, length)) & 0x0ffffffffL;
- }
- return (((long) (bytes[offset + length - 1] & 0xff)) << 56)
- + (((long) (bytes[offset + length - 2] & 0xff)) << 48)
- + (((long) (bytes[offset + length - 3] & 0xff)) << 40)
- + (((long) (bytes[offset + length - 4] & 0xff)) << 32)
- + (((long) (bytes[offset + length - 5] & 0xff)) << 24)
- + (((long) (bytes[offset + length - 6] & 0xff)) << 16)
- + (((long) (bytes[offset + length - 7] & 0xff)) << 8)
- + (((long) (bytes[offset + length - 8] & 0xff)));
- }
+ public static long getLongReverse(byte[] bytes, int offset, int length) {
+ if (length < 8) {
+ return ((long) getIntReverse(bytes, offset, length)) & 0x0ffffffffL;
+ }
+ return (((long) (bytes[offset + length - 1] & 0xff)) << 56)
+ + (((long) (bytes[offset + length - 2] & 0xff)) << 48)
+ + (((long) (bytes[offset + length - 3] & 0xff)) << 40)
+ + (((long) (bytes[offset + length - 4] & 0xff)) << 32)
+ + (((long) (bytes[offset + length - 5] & 0xff)) << 24)
+ + (((long) (bytes[offset + length - 6] & 0xff)) << 16)
+ + (((long) (bytes[offset + length - 7] & 0xff)) << 8) + (((long) (bytes[offset + length - 8] & 0xff)));
+ }
- @Override
- public int compareTo(IPointable pointer) {
- return compareTo(pointer.getByteArray(), pointer.getStartOffset(),
- pointer.getLength());
- }
+ @Override
+ public int compareTo(IPointable pointer) {
+ return compareTo(pointer.getByteArray(), pointer.getStartOffset(), pointer.getLength());
+ }
- @Override
- public int compareTo(byte[] bytes, int offset, int length) {
+ @Override
+ public int compareTo(byte[] bytes, int offset, int length) {
- if (this.length != length) {
- return this.length - length;
- }
- for (int i = length - 1; i >= 0; i--) {
- int cmp = (this.bytes[this.start + i] & 0xff) - (bytes[offset + i] & 0xff);
- if (cmp !=0){
- return cmp;
- }
- }
+ if (this.length != length) {
+ return this.length - length;
+ }
+ for (int i = length - 1; i >= 0; i--) {
+ int cmp = (this.bytes[this.start + i] & 0xff) - (bytes[offset + i] & 0xff);
+ if (cmp != 0) {
+ return cmp;
+ }
+ }
- return 0;
- }
+ return 0;
+ }
- @Override
- public int hash() {
- int hash = KmerHashPartitioncomputerFactory.hashBytes(bytes, start,
- length);
- return hash;
- }
+ @Override
+ public int hash() {
+ int hash = KmerHashPartitioncomputerFactory.hashBytes(bytes, start, length);
+ return hash;
+ }
- @Override
- public byte byteValue() {
- return bytes[start + length - 1];
- }
+ @Override
+ public byte byteValue() {
+ return bytes[start + length - 1];
+ }
- @Override
- public short shortValue() {
- return getShortReverse(bytes, start, length);
- }
+ @Override
+ public short shortValue() {
+ return getShortReverse(bytes, start, length);
+ }
- @Override
- public int intValue() {
- return getIntReverse(bytes, start, length);
- }
+ @Override
+ public int intValue() {
+ return getIntReverse(bytes, start, length);
+ }
- @Override
- public long longValue() {
- return getLongReverse(bytes, start, length);
- }
+ @Override
+ public long longValue() {
+ return getLongReverse(bytes, start, length);
+ }
- @Override
- public float floatValue() {
- return Float.intBitsToFloat(intValue());
- }
+ @Override
+ public float floatValue() {
+ return Float.intBitsToFloat(intValue());
+ }
- @Override
- public double doubleValue() {
- return Double.longBitsToDouble(longValue());
- }
+ @Override
+ public double doubleValue() {
+ return Double.longBitsToDouble(longValue());
+ }
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
index 8c3f277..46e162c 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
@@ -4,12 +4,13 @@
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.mapred.JobConf;
+import edu.uci.ics.genomix.job.GenomixJob;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -23,9 +24,11 @@
private static final long serialVersionUID = 1L;
private ConfFactory confFactory;
+ private final int kmerlength;
public KMerSequenceWriterFactory(JobConf conf) throws HyracksDataException {
this.confFactory = new ConfFactory(conf);
+ this.kmerlength = conf.getInt(GenomixJob.KMER_LENGTH, GenomixJob.DEFAULT_KMER);
}
public class TupleWriter implements ITupleWriter {
@@ -37,7 +40,7 @@
Writer writer = null;
KmerCountValue reEnterCount = new KmerCountValue();
- BytesWritable reEnterKey = new BytesWritable();
+ KmerBytesWritable reEnterKey = new KmerBytesWritable(kmerlength);
/**
* assumption is that output never change source!
@@ -66,7 +69,7 @@
public void open(DataOutput output) throws HyracksDataException {
try {
writer = SequenceFile.createWriter(cf.getConf(),
- (FSDataOutputStream) output, BytesWritable.class,
+ (FSDataOutputStream) output, KmerBytesWritable.class,
KmerCountValue.class, CompressionType.NONE, null);
} catch (IOException e) {
throw new HyracksDataException(e);
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
index 0975fd2..4ba38d0 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
@@ -3,7 +3,8 @@
import java.io.DataOutput;
import java.io.IOException;
-import edu.uci.ics.genomix.type.Kmer;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
@@ -12,53 +13,48 @@
public class KMerTextWriterFactory implements ITupleWriterFactory {
- /**
+ /**
*
*/
- private static final long serialVersionUID = 1L;
- private final int KMER;
+ private static final long serialVersionUID = 1L;
+ private KmerBytesWritable kmer;
- public KMerTextWriterFactory(int kmer) {
- KMER = kmer;
- }
+ public KMerTextWriterFactory(int k) {
+ kmer = new KmerBytesWritable(k);
+ }
- public class TupleWriter implements ITupleWriter {
- @Override
- public void write(DataOutput output, ITupleReference tuple)
- throws HyracksDataException {
- try {
- output.write(Kmer.recoverKmerFrom(KMER,
- tuple.getFieldData(0), tuple.getFieldStart(0),
- tuple.getFieldLength(0)).getBytes());
- output.writeByte('\t');
- output.write(Kmer.GENE_CODE.getSymbolFromBitMap(tuple
- .getFieldData(1)[tuple.getFieldStart(1)]).getBytes());
- output.writeByte('\t');
- output.write(String.valueOf((int)tuple
- .getFieldData(2)[tuple.getFieldStart(2)]).getBytes());
- output.writeByte('\n');
- } catch (IOException e) {
- throw new HyracksDataException(e);
- }
- }
+ public class TupleWriter implements ITupleWriter {
+ @Override
+ public void write(DataOutput output, ITupleReference tuple) throws HyracksDataException {
+ try {
+ kmer.set(tuple.getFieldData(0), tuple.getFieldStart(0), tuple.getFieldLength(0));
+ output.write(kmer.toString().getBytes());
+ output.writeByte('\t');
+ output.write(GeneCode.getSymbolFromBitMap(tuple.getFieldData(1)[tuple.getFieldStart(1)]).getBytes());
+ output.writeByte('\t');
+ output.write(String.valueOf((int) tuple.getFieldData(2)[tuple.getFieldStart(2)]).getBytes());
+ output.writeByte('\n');
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
- @Override
- public void open(DataOutput output) throws HyracksDataException {
- // TODO Auto-generated method stub
-
- }
+ @Override
+ public void open(DataOutput output) throws HyracksDataException {
+ // TODO Auto-generated method stub
- @Override
- public void close(DataOutput output) throws HyracksDataException {
- // TODO Auto-generated method stub
- }
- }
+ }
- @Override
- public ITupleWriter getTupleWriter(IHyracksTaskContext ctx)
- throws HyracksDataException {
- // TODO Auto-generated method stub
- return new TupleWriter();
- }
+ @Override
+ public void close(DataOutput output) throws HyracksDataException {
+ // TODO Auto-generated method stub
+ }
+ }
+
+ @Override
+ public ITupleWriter getTupleWriter(IHyracksTaskContext ctx) throws HyracksDataException {
+ // TODO Auto-generated method stub
+ return new TupleWriter();
+ }
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
index e5b7fa9..d4b88ba 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
@@ -8,8 +8,8 @@
import org.apache.hadoop.io.Text;
import edu.uci.ics.genomix.data.std.accessors.ByteSerializerDeserializer;
-import edu.uci.ics.genomix.type.Kmer;
-import edu.uci.ics.genomix.type.Kmer.GENE_CODE;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.hyracks.api.comm.IFrameWriter;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -19,121 +19,124 @@
import edu.uci.ics.hyracks.hdfs.api.IKeyValueParser;
import edu.uci.ics.hyracks.hdfs.api.IKeyValueParserFactory;
-public class ReadsKeyValueParserFactory implements
- IKeyValueParserFactory<LongWritable, Text> {
- private static final long serialVersionUID = 1L;
+;
- private int k;
- private int byteNum;
- private boolean bReversed;
+public class ReadsKeyValueParserFactory implements IKeyValueParserFactory<LongWritable, Text> {
+ private static final long serialVersionUID = 1L;
- public ReadsKeyValueParserFactory(int k, boolean bGenerateReversed) {
- this.k = k;
- byteNum = (byte) Math.ceil((double) k / 4.0);
- bReversed = bGenerateReversed;
- }
+ private KmerBytesWritable kmer;
+ private boolean bReversed;
- @Override
- public IKeyValueParser<LongWritable, Text> createKeyValueParser(
- final IHyracksTaskContext ctx) {
- final ArrayTupleBuilder tupleBuilder = new ArrayTupleBuilder(2);
- final ByteBuffer outputBuffer = ctx.allocateFrame();
- final FrameTupleAppender outputAppender = new FrameTupleAppender(
- ctx.getFrameSize());
- outputAppender.reset(outputBuffer, true);
+ public ReadsKeyValueParserFactory(int k, boolean bGenerateReversed) {
+ bReversed = bGenerateReversed;
+ kmer = new KmerBytesWritable(k);
+ }
- return new IKeyValueParser<LongWritable, Text>() {
+ @Override
+ public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
+ final ArrayTupleBuilder tupleBuilder = new ArrayTupleBuilder(2);
+ final ByteBuffer outputBuffer = ctx.allocateFrame();
+ final FrameTupleAppender outputAppender = new FrameTupleAppender(ctx.getFrameSize());
+ outputAppender.reset(outputBuffer, true);
- @Override
- public void parse(LongWritable key, Text value, IFrameWriter writer)
- throws HyracksDataException {
- String geneLine = value.toString(); // Read the Real Gene Line
- Pattern genePattern = Pattern.compile("[AGCT]+");
- Matcher geneMatcher = genePattern.matcher(geneLine);
- boolean isValid = geneMatcher.matches();
- if (isValid) {
- SplitReads(geneLine.getBytes(), writer);
- }
- }
+ return new IKeyValueParser<LongWritable, Text>() {
- private void SplitReads(byte[] array, IFrameWriter writer) {
- /** first kmer */
- byte[] kmer = Kmer.compressKmer(k, array, 0);
- byte pre = 0;
- byte next = GENE_CODE.getAdjBit(array[k]);
- InsertToFrame(kmer, pre, next, writer);
+ @Override
+ public void parse(LongWritable key, Text value, IFrameWriter writer) throws HyracksDataException {
+ String geneLine = value.toString(); // Read the Real Gene Line
+ Pattern genePattern = Pattern.compile("[AGCT]+");
+ Matcher geneMatcher = genePattern.matcher(geneLine);
+ boolean isValid = geneMatcher.matches();
+ if (isValid) {
+ SplitReads(geneLine.getBytes(), writer);
+ }
+ }
- /** middle kmer */
- for (int i = k; i < array.length - 1; i++) {
- pre = Kmer.moveKmer(k, kmer, array[i]);
- next = GENE_CODE.getAdjBit(array[i + 1]);
- InsertToFrame(kmer, pre, next, writer);
+ private void SplitReads(byte[] array, IFrameWriter writer) {
+ /** first kmer */
+ int k = kmer.getKmerLength();
+ kmer.setByRead(array, 0);
+ byte pre = 0;
+ byte next = GeneCode.getAdjBit(array[k]);
+ InsertToFrame(kmer, pre, next, writer);
- }
- /** last kmer */
- pre = Kmer.moveKmer(k, kmer, array[array.length - 1]);
- next = 0;
- InsertToFrame(kmer, pre, next, writer);
+ /** middle kmer */
+ for (int i = k; i < array.length - 1; i++) {
+ pre = kmer.shiftKmerWithNextChar(array[i]);
+ next = GeneCode.getAdjBit(array[i + 1]);
+ InsertToFrame(kmer, pre, next, writer);
+ }
- if (bReversed) {
- /** first kmer */
- kmer = Kmer.compressKmerReverse(k, array, 0);
- next = 0;
- pre = GENE_CODE.getAdjBit(array[k]);
- InsertToFrame(kmer, pre, next, writer);
- /** middle kmer */
- for (int i = k; i < array.length - 1; i++) {
- next = Kmer.moveKmerReverse(k, kmer, array[i]);
- pre = GENE_CODE.getAdjBit(array[i + 1]);
- InsertToFrame(kmer, pre, next, writer);
- }
- /** last kmer */
- next = Kmer.moveKmerReverse(k, kmer,
- array[array.length - 1]);
- pre = 0;
- InsertToFrame(kmer, pre, next, writer);
- }
- }
+ /** last kmer */
+ pre = kmer.shiftKmerWithNextChar(array[array.length - 1]);
+ next = 0;
+ InsertToFrame(kmer, pre, next, writer);
- private void InsertToFrame(byte[] kmer, byte pre, byte next,
- IFrameWriter writer) {
- try {
- byte adj = GENE_CODE.mergePreNextAdj(pre, next);
- tupleBuilder.reset();
- tupleBuilder.addField(kmer, 0, byteNum);
- tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE,
- adj);
+ if (bReversed) {
+ /** first kmer */
+ kmer.setByRead(array, 0);
+ next = 0;
+ pre = GeneCode.getAdjBit(array[k]);
+ InsertToFrame(kmer, pre, next, writer);
+ /** middle kmer */
+ for (int i = k; i < array.length - 1; i++) {
+ next = kmer.shiftKmerWithPreChar(array[i]);
+ pre = GeneCode.getAdjBit(array[i + 1]);
+ InsertToFrame(kmer, pre, next, writer);
+ }
+ /** last kmer */
+ next = kmer.shiftKmerWithPreChar(array[array.length - 1]);
+ pre = 0;
+ InsertToFrame(kmer, pre, next, writer);
+ }
+ }
- if (!outputAppender.append(
- tupleBuilder.getFieldEndOffsets(),
- tupleBuilder.getByteArray(), 0,
- tupleBuilder.getSize())) {
- FrameUtils.flushFrame(outputBuffer, writer);
- outputAppender.reset(outputBuffer, true);
- if (!outputAppender.append(
- tupleBuilder.getFieldEndOffsets(),
- tupleBuilder.getByteArray(), 0,
- tupleBuilder.getSize())) {
- throw new IllegalStateException(
- "Failed to copy an record into a frame: the record size is too large.");
- }
- }
- } catch (Exception e) {
- throw new IllegalStateException(e);
- }
- }
+ /**
+ * At this graph building phase, we assume that all kmers have the
+ * same length, so the kmer length itself is not written to the output.
+ *
+ * @param kmer
+ * : input kmer
+ * @param pre
+ * : pre neighbor code
+ * @param next
+ * : next neighbor code
+ * @param writer
+ * : output writer
+ */
+ private void InsertToFrame(KmerBytesWritable kmer, byte pre, byte next, IFrameWriter writer) {
+ try {
+ byte adj = GeneCode.mergePreNextAdj(pre, next);
+ tupleBuilder.reset();
+ tupleBuilder.addField(kmer.getBytes(), 0, kmer.getLength());
+ tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE, adj);
- @Override
- public void open(IFrameWriter writer) throws HyracksDataException {
- // TODO Auto-generated method stub
+ if (!outputAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0,
+ tupleBuilder.getSize())) {
+ FrameUtils.flushFrame(outputBuffer, writer);
+ outputAppender.reset(outputBuffer, true);
+ if (!outputAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0,
+ tupleBuilder.getSize())) {
+ throw new IllegalStateException(
+ "Failed to copy an record into a frame: the record size is too large.");
+ }
+ }
+ } catch (Exception e) {
+ throw new IllegalStateException(e);
+ }
+ }
- }
+ @Override
+ public void open(IFrameWriter writer) throws HyracksDataException {
+ // TODO Auto-generated method stub
- @Override
- public void close(IFrameWriter writer) throws HyracksDataException {
- FrameUtils.flushFrame(outputBuffer, writer);
- }
- };
- }
+ }
+
+ @Override
+ public void close(IFrameWriter writer) throws HyracksDataException {
+ FrameUtils.flushFrame(outputBuffer, writer);
+ }
+ };
+ }
}
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
index 0751707..11786f3 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
@@ -31,7 +31,7 @@
public static final String GROUPBY_HYBRID_RECORDSIZE_CROSS = "genomix.graph.groupby.hybrid.recordsize.cross";
public static final String GROUPBY_HYBRID_HASHLEVEL = "genomix.graph.groupby.hybrid.hashlevel";
- public static final int DEFAULT_KMER = 55;
+ public static final int DEFAULT_KMER = 21;
public static final int DEFAULT_FRAME_SIZE = 32768;
public static final int DEFAULT_FRAME_LIMIT = 4096;
public static final int DEFAULT_TABLE_SIZE = 10485767;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
index 683c0a1..9462fb0 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
@@ -280,6 +280,10 @@
protected void initJobConfiguration() {
kmers = conf.getInt(GenomixJob.KMER_LENGTH, GenomixJob.DEFAULT_KMER);
+ if (kmers % 2 == 0){
+ kmers--;
+ conf.setInt(GenomixJob.KMER_LENGTH, kmers);
+ }
frameLimits = conf.getInt(GenomixJob.FRAME_LIMIT,
GenomixJob.DEFAULT_FRAME_LIMIT);
tableSize = conf.getInt(GenomixJob.TABLE_SIZE,
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatReadsKeyValueParserFactory.java
index 4c2205f..561d64a 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatReadsKeyValueParserFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatReadsKeyValueParserFactory.java
@@ -5,6 +5,7 @@
import org.apache.hadoop.io.BytesWritable;
import edu.uci.ics.genomix.data.std.accessors.ByteSerializerDeserializer;
+import edu.uci.ics.genomix.type.GeneCode;
import edu.uci.ics.genomix.type.KmerCountValue;
import edu.uci.ics.genomix.type.KmerUtil;
import edu.uci.ics.hyracks.api.comm.IFrameWriter;
@@ -46,7 +47,7 @@
IFrameWriter writer) throws HyracksDataException {
byte adjMap = value.getAdjBitMap();
byte count = value.getCount();
- InsertToFrame((byte) (KmerUtil.inDegree(adjMap)*10+KmerUtil.outDegree(adjMap)),count,writer);
+ InsertToFrame((byte) (GeneCode.inDegree(adjMap)*10+GeneCode.outDegree(adjMap)),count,writer);
}
@Override
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
index ba0e5b9..3eb4347 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
@@ -28,221 +28,204 @@
import edu.uci.ics.genomix.driver.Driver;
import edu.uci.ics.genomix.driver.Driver.Plan;
import edu.uci.ics.genomix.job.GenomixJob;
-import edu.uci.ics.genomix.type.Kmer;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
-import edu.uci.ics.genomix.example.jobrun.TestUtils;;
public class JobRunTest {
- private static final String ACTUAL_RESULT_DIR = "actual";
- private static final String PATH_TO_HADOOP_CONF = "src/test/resources/hadoop/conf";
+ private static final String ACTUAL_RESULT_DIR = "actual";
+ private static final String PATH_TO_HADOOP_CONF = "src/test/resources/hadoop/conf";
- private static final String DATA_PATH = "src/test/resources/data/webmap/text.txt";
- private static final String HDFS_INPUT_PATH = "/webmap";
- private static final String HDFS_OUTPUT_PATH = "/webmap_result";
+ private static final String DATA_PATH = "src/test/resources/data/webmap/text.txt";
+ private static final String HDFS_INPUT_PATH = "/webmap";
+ private static final String HDFS_OUTPUT_PATH = "/webmap_result";
- private static final String DUMPED_RESULT = ACTUAL_RESULT_DIR
- + HDFS_OUTPUT_PATH + "/merged.txt";
- private static final String CONVERT_RESULT = DUMPED_RESULT + ".txt";
- private static final String EXPECTED_PATH = "src/test/resources/expected/result2";
- private static final String EXPECTED_REVERSE_PATH = "src/test/resources/expected/result_reverse";
+ private static final String DUMPED_RESULT = ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH + "/merged.txt";
+ private static final String CONVERT_RESULT = DUMPED_RESULT + ".txt";
+ private static final String EXPECTED_PATH = "src/test/resources/expected/result2";
+ private static final String EXPECTED_REVERSE_PATH = "src/test/resources/expected/result_reverse";
- private static final String HYRACKS_APP_NAME = "genomix";
- private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR
- + File.separator + "conf.xml";
- private MiniDFSCluster dfsCluster;
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
+ private MiniDFSCluster dfsCluster;
- private JobConf conf = new JobConf();
- private int numberOfNC = 2;
- private int numPartitionPerMachine = 1;
+ private JobConf conf = new JobConf();
+ private int numberOfNC = 2;
+ private int numPartitionPerMachine = 1;
- private Driver driver;
+ private Driver driver;
- @Before
- public void setUp() throws Exception {
- cleanupStores();
- edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.init();
- FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
- FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
- startHDFS();
+ @Before
+ public void setUp() throws Exception {
+ cleanupStores();
+ edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.init();
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ startHDFS();
- FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
- FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
+ FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
+ FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
- conf.setInt(GenomixJob.KMER_LENGTH, 5);
- driver = new Driver(edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.CC_HOST,
- edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT,
- numPartitionPerMachine);
- }
+ conf.setInt(GenomixJob.KMER_LENGTH, 5);
+ driver = new Driver(edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.CC_HOST,
+ edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT, numPartitionPerMachine);
+ }
- private void cleanupStores() throws IOException {
- FileUtils.forceMkdir(new File("teststore"));
- FileUtils.forceMkdir(new File("build"));
- FileUtils.cleanDirectory(new File("teststore"));
- FileUtils.cleanDirectory(new File("build"));
- }
+ private void cleanupStores() throws IOException {
+ FileUtils.forceMkdir(new File("teststore"));
+ FileUtils.forceMkdir(new File("build"));
+ FileUtils.cleanDirectory(new File("teststore"));
+ FileUtils.cleanDirectory(new File("build"));
+ }
- private void startHDFS() throws IOException {
- conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
- conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
- conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
+ private void startHDFS() throws IOException {
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
- FileSystem lfs = FileSystem.getLocal(new Configuration());
- lfs.delete(new Path("build"), true);
- System.setProperty("hadoop.log.dir", "logs");
- dfsCluster = new MiniDFSCluster(conf, numberOfNC, true, null);
- FileSystem dfs = FileSystem.get(conf);
- Path src = new Path(DATA_PATH);
- Path dest = new Path(HDFS_INPUT_PATH);
- Path result = new Path(HDFS_OUTPUT_PATH);
- dfs.mkdirs(dest);
- //dfs.mkdirs(result);
- dfs.copyFromLocalFile(src, dest);
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, numberOfNC, true, null);
+ FileSystem dfs = FileSystem.get(conf);
+ Path src = new Path(DATA_PATH);
+ Path dest = new Path(HDFS_INPUT_PATH);
+ dfs.mkdirs(dest);
+ //dfs.mkdirs(result);
+ dfs.copyFromLocalFile(src, dest);
- DataOutputStream confOutput = new DataOutputStream(
- new FileOutputStream(new File(HADOOP_CONF_PATH)));
- conf.writeXml(confOutput);
- confOutput.flush();
- confOutput.close();
- }
+ DataOutputStream confOutput = new DataOutputStream(new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
- private void cleanUpReEntry() throws IOException {
- FileSystem lfs = FileSystem.getLocal(new Configuration());
- if (lfs.exists(new Path(DUMPED_RESULT))) {
- lfs.delete(new Path(DUMPED_RESULT), true);
- }
- FileSystem dfs = FileSystem.get(conf);
- if (dfs.exists(new Path(HDFS_OUTPUT_PATH))) {
- dfs.delete(new Path(HDFS_OUTPUT_PATH), true);
- }
- }
+ private void cleanUpReEntry() throws IOException {
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ if (lfs.exists(new Path(DUMPED_RESULT))) {
+ lfs.delete(new Path(DUMPED_RESULT), true);
+ }
+ FileSystem dfs = FileSystem.get(conf);
+ if (dfs.exists(new Path(HDFS_OUTPUT_PATH))) {
+ dfs.delete(new Path(HDFS_OUTPUT_PATH), true);
+ }
+ }
- @Test
- public void TestAll() throws Exception{
- cleanUpReEntry();
- TestExternalGroupby();
- cleanUpReEntry();
- TestPreClusterGroupby();
- cleanUpReEntry();
- TestHybridGroupby();
- cleanUpReEntry();
- conf.setBoolean(GenomixJob.REVERSED_KMER, true);
- TestExternalReversedGroupby();
- cleanUpReEntry();
- TestPreClusterReversedGroupby();
- cleanUpReEntry();
- TestHybridReversedGroupby();
- }
-
- public void TestExternalGroupby() throws Exception {
- conf.set(GenomixJob.GROUPBY_TYPE, "external");
- System.err.println("Testing ExternalGroupBy");
- driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults(EXPECTED_PATH));
- }
+ @Test
+ public void TestAll() throws Exception {
+ cleanUpReEntry();
+ TestExternalGroupby();
+ cleanUpReEntry();
+ TestPreClusterGroupby();
+ cleanUpReEntry();
+ TestHybridGroupby();
+ cleanUpReEntry();
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ TestExternalReversedGroupby();
+ cleanUpReEntry();
+ TestPreClusterReversedGroupby();
+ cleanUpReEntry();
+ TestHybridReversedGroupby();
+ }
- public void TestPreClusterGroupby() throws Exception {
- conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
- System.err.println("Testing PreClusterGroupBy");
- driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults(EXPECTED_PATH));
- }
+ public void TestExternalGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "external");
+ System.err.println("Testing ExternalGroupBy");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+ }
- public void TestHybridGroupby() throws Exception {
- conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
- System.err.println("Testing HybridGroupBy");
- driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults(EXPECTED_PATH));
- }
-
- public void TestExternalReversedGroupby() throws Exception{
- conf.set(GenomixJob.GROUPBY_TYPE, "external");
- conf.setBoolean(GenomixJob.REVERSED_KMER, true);
- System.err.println("Testing ExternalGroupBy + Reversed");
- driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
- }
- public void TestPreClusterReversedGroupby() throws Exception{
- conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
- conf.setBoolean(GenomixJob.REVERSED_KMER, true);
- System.err.println("Testing PreclusterGroupBy + Reversed");
- driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
- }
- public void TestHybridReversedGroupby() throws Exception{
- conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
- conf.setBoolean(GenomixJob.REVERSED_KMER, true);
- System.err.println("Testing HybridGroupBy + Reversed");
- driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
- Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
- }
+ public void TestPreClusterGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+ System.err.println("Testing PreClusterGroupBy");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+ }
- private boolean checkResults(String expectedPath) throws Exception {
- File dumped = null;
- String format = conf.get(GenomixJob.OUTPUT_FORMAT);
- if ("text".equalsIgnoreCase(format)) {
- FileUtil.copyMerge(FileSystem.get(conf),
- new Path(HDFS_OUTPUT_PATH), FileSystem
- .getLocal(new Configuration()), new Path(
- DUMPED_RESULT), false, conf, null);
- dumped = new File(DUMPED_RESULT);
- } else {
-
- FileSystem.getLocal(new Configuration()).mkdirs(new Path(ACTUAL_RESULT_DIR
- + HDFS_OUTPUT_PATH));
- File filePathTo = new File(CONVERT_RESULT);
- BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
- for (int i = 0; i < numPartitionPerMachine * numberOfNC; i++) {
- String partname = "/part-" + i;
-// FileUtil.copy(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH
-// + partname), FileSystem.getLocal(new Configuration()),
-// new Path(ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH + partname), false, conf);
-
-
- Path path = new Path(HDFS_OUTPUT_PATH
- + partname);
- FileSystem dfs = FileSystem.get(conf);
- if (dfs.getFileStatus(path).getLen() == 0){
- continue;
- }
- SequenceFile.Reader reader = new SequenceFile.Reader(dfs, path,
- conf);
- BytesWritable key = (BytesWritable) ReflectionUtils
- .newInstance(reader.getKeyClass(), conf);
- KmerCountValue value = (KmerCountValue) ReflectionUtils
- .newInstance(reader.getValueClass(), conf);
+ public void TestHybridGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
+ System.err.println("Testing HybridGroupBy");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+ }
- int k = conf.getInt(GenomixJob.KMER_LENGTH, 25);
- while (reader.next(key, value)) {
- if (key == null || value == null){
- break;
- }
- bw.write(Kmer.recoverKmerFrom(k, key.getBytes(), 0,
- key.getLength())
- + "\t" + value.toString());
- System.out.println(Kmer.recoverKmerFrom(k, key.getBytes(), 0,
- key.getLength())
- + "\t" + value.toString());
- bw.newLine();
- }
- reader.close();
+ public void TestExternalReversedGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "external");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing ExternalGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
- }
- bw.close();
- dumped = new File(CONVERT_RESULT);
- }
+ public void TestPreClusterReversedGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing PreclusterGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
- TestUtils.compareWithSortedResult(new File(expectedPath), dumped);
- return true;
- }
+ public void TestHybridReversedGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing HybridGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
- @After
- public void tearDown() throws Exception {
- edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.deinit();
- cleanupHDFS();
- }
+ private boolean checkResults(String expectedPath) throws Exception {
+ File dumped = null;
+ String format = conf.get(GenomixJob.OUTPUT_FORMAT);
+ if ("text".equalsIgnoreCase(format)) {
+ FileUtil.copyMerge(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH),
+ FileSystem.getLocal(new Configuration()), new Path(DUMPED_RESULT), false, conf, null);
+ dumped = new File(DUMPED_RESULT);
+ } else {
- private void cleanupHDFS() throws Exception {
- dfsCluster.shutdown();
- }
+ FileSystem.getLocal(new Configuration()).mkdirs(new Path(ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH));
+ File filePathTo = new File(CONVERT_RESULT);
+ BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+ for (int i = 0; i < numPartitionPerMachine * numberOfNC; i++) {
+ String partname = "/part-" + i;
+ // FileUtil.copy(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH
+ // + partname), FileSystem.getLocal(new Configuration()),
+ // new Path(ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH + partname), false, conf);
+
+ Path path = new Path(HDFS_OUTPUT_PATH + partname);
+ FileSystem dfs = FileSystem.get(conf);
+ if (dfs.getFileStatus(path).getLen() == 0) {
+ continue;
+ }
+ SequenceFile.Reader reader = new SequenceFile.Reader(dfs, path, conf);
+
+ KmerBytesWritable key = (KmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+
+ int k = conf.getInt(GenomixJob.KMER_LENGTH, 25);
+ while (reader.next(key, value)) {
+ if (key == null || value == null) {
+ break;
+ }
+ bw.write(key.toString() + "\t" + value.toString());
+ System.out.println(key.toString() + "\t" + value.toString());
+ bw.newLine();
+ }
+ reader.close();
+ }
+ bw.close();
+ dumped = new File(CONVERT_RESULT);
+ }
+
+ TestUtils.compareWithSortedResult(new File(expectedPath), dumped);
+ return true;
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.deinit();
+ cleanupHDFS();
+ }
+
+ private void cleanupHDFS() throws Exception {
+ dfsCluster.shutdown();
+ }
}