Merge branch 'fullstack_genomix' of https://code.google.com/p/hyracks into fullstack_genomix
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java
new file mode 100644
index 0000000..ef5a62b
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java
@@ -0,0 +1,126 @@
+package edu.uci.ics.genomix.type;
+
+public class GeneCode {
+ public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
+ /**
+ * Gene codes. Make sure these 4 ids equal the index of the corresponding
+ * char in {@link #GENE_SYMBOL}.
+ */
+ public static final byte A = 0;
+ public static final byte C = 1;
+ public static final byte G = 2;
+ public static final byte T = 3;
+
+ public static byte getCodeFromSymbol(byte ch) {
+ byte r = 0; // default: an unrecognized symbol maps to code 0, the same as A
+ switch (ch) {
+ case 'A':
+ case 'a':
+ r = A;
+ break;
+ case 'C':
+ case 'c':
+ r = C;
+ break;
+ case 'G':
+ case 'g':
+ r = G;
+ break;
+ case 'T':
+ case 't':
+ r = T;
+ break;
+ }
+ return r;
+ }
+
+ public static byte getSymbolFromCode(byte code) {
+ if (code < 0 || code > 3) { // not a gene code, e.g. the -1 returned by getGeneCodeFromBitMap
+ return '!';
+ }
+ return GENE_SYMBOL[code];
+ }
+
+ public static byte getAdjBit(byte t) {
+ byte r = 0; // default: an unrecognized symbol yields an empty bitmap
+ switch (t) {
+ case 'A':
+ case 'a':
+ r = 1 << A;
+ break;
+ case 'C':
+ case 'c':
+ r = 1 << C;
+ break;
+ case 'G':
+ case 'g':
+ r = 1 << G;
+ break;
+ case 'T':
+ case 't':
+ r = 1 << T;
+ break;
+ }
+ return r;
+ }
+
+ /**
+ * Used by path merge, which merges a kmer with its next neighbor: converts
+ * a bitmap that has exactly one neighbor bit set back to that gene code.
+ *
+ * @param t
+ * the neighbor code in BitMap; must be {@code 1 << code} for a single code
+ * @return the gene code, or -1 if {@code t} is not a single-neighbor bitmap
+ */
+ public static byte getGeneCodeFromBitMap(byte t) {
+ switch (t) {
+ case 1 << A:
+ return A;
+ case 1 << C:
+ return C;
+ case 1 << G:
+ return G;
+ case 1 << T:
+ return T;
+ }
+ return -1;
+ }
+
+ public static int countNumberOfBitSet(int i) {
+ // Population count: the number of 1-bits in the two's-complement
+ // representation of i. Integer.bitCount gives the same result as
+ // repeatedly clearing the lowest set bit (i &= i - 1) until i is 0,
+ // for every int value, negatives included.
+ return Integer.bitCount(i);
+ }
+
+ public static int inDegree(byte bitmap) {
+ return countNumberOfBitSet((bitmap >> 4) & 0x0f); // bits set in the high nibble
+ }
+
+ public static int outDegree(byte bitmap) {
+ return countNumberOfBitSet(bitmap & 0x0f); // bits set in the low nibble
+ }
+
+ public static byte mergePreNextAdj(byte pre, byte next) {
+ return (byte) (pre << 4 | (next & 0x0f)); // pre goes to the high nibble, next to the low nibble
+ }
+
+ public static String getSymbolFromBitMap(byte code) {
+ int left = (code >> 4) & 0x0F; // high nibble, printed before the '|'
+ int right = code & 0x0F; // low nibble, printed after the '|'
+ StringBuilder str = new StringBuilder();
+ for (int i = A; i <= T; i++) {
+ if ((left & (1 << i)) != 0) {
+ str.append((char) GENE_SYMBOL[i]);
+ }
+ }
+ str.append('|');
+ for (int i = A; i <= T; i++) {
+ if ((right & (1 << i)) != 0) {
+ str.append((char) GENE_SYMBOL[i]);
+ }
+ }
+ return str.toString();
+ }
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java
deleted file mode 100644
index 21fee2b..0000000
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * Copyright 2009-2012 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package edu.uci.ics.genomix.type;
-
-public class Kmer {
-
- public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
-
- public final static class GENE_CODE {
-
- /**
- * make sure this 4 ids equal to the sequence id of char in
- * {@GENE_SYMBOL}
- */
- public static final byte A = 0;
- public static final byte C = 1;
- public static final byte G = 2;
- public static final byte T = 3;
-
- public static byte getCodeFromSymbol(byte ch) {
- byte r = 0;
- switch (ch) {
- case 'A':
- case 'a':
- r = A;
- break;
- case 'C':
- case 'c':
- r = C;
- break;
- case 'G':
- case 'g':
- r = G;
- break;
- case 'T':
- case 't':
- r = T;
- break;
- }
- return r;
- }
-
- public static byte getSymbolFromCode(byte code) {
- if (code > 3) {
- return '!';
- }
- return GENE_SYMBOL[code];
- }
-
- public static byte getAdjBit(byte t) {
- byte r = 0;
- switch (t) {
- case 'A':
- case 'a':
- r = 1 << A;
- break;
- case 'C':
- case 'c':
- r = 1 << C;
- break;
- case 'G':
- case 'g':
- r = 1 << G;
- break;
- case 'T':
- case 't':
- r = 1 << T;
- break;
- }
- return r;
- }
-
- /**
- * It works for path merge.
- * Merge the kmer by his next, we need to make sure the @{t} is a single neighbor.
- * @param t the neighbor code in BitMap
- * @return the genecode
- */
- public static byte getGeneCodeFromBitMap(byte t) {
- switch (t) {
- case 1 << A:
- return A;
- case 1 << C:
- return C;
- case 1 << G:
- return G;
- case 1 << T:
- return T;
- }
- return -1;
- }
-
- public static byte mergePreNextAdj(byte pre, byte next) {
- return (byte) (pre << 4 | (next & 0x0f));
- }
-
- public static String getSymbolFromBitMap(byte code) {
- int left = (code >> 4) & 0x0F;
- int right = code & 0x0F;
- StringBuilder str = new StringBuilder();
- for (int i = A; i <= T; i++) {
- if ((left & (1 << i)) != 0) {
- str.append((char) GENE_SYMBOL[i]);
- }
- }
- str.append('|');
- for (int i = A; i <= T; i++) {
- if ((right & (1 << i)) != 0) {
- str.append((char) GENE_SYMBOL[i]);
- }
- }
- return str.toString();
- }
- }
-
- public static String recoverKmerFrom(int k, byte[] keyData, int keyStart,
- int keyLength) {
- StringBuilder strKmer = new StringBuilder();
- int byteId = keyStart + keyLength - 1;
- byte currentbyte = keyData[byteId];
- for (int geneCount = 0; geneCount < k; geneCount++) {
- if (geneCount % 4 == 0 && geneCount > 0) {
- currentbyte = keyData[--byteId];
- }
- strKmer.append((char) GENE_SYMBOL[(currentbyte >> ((geneCount % 4) * 2)) & 0x03]);
- }
- return strKmer.toString();
- }
-
- public static int getByteNumFromK(int k) {
- int x = k / 4;
- if (k % 4 != 0) {
- x += 1;
- }
- return x;
- }
-
- /**
- * Compress Kmer into bytes array AATAG will compress as [0x000G, 0xATAA]
- *
- * @param kmer
- * @param input
- * array
- * @param start
- * position
- * @return initialed kmer array
- */
- public static byte[] compressKmer(int k, byte[] array, int start) {
- final int byteNum = getByteNumFromK(k);
- byte[] bytes = new byte[byteNum];
-
- byte l = 0;
- int bytecount = 0;
- int bcount = byteNum - 1;
- for (int i = start; i < start + k; i++) {
- byte code = GENE_CODE.getCodeFromSymbol(array[i]);
- l |= (byte) (code << bytecount);
- bytecount += 2;
- if (bytecount == 8) {
- bytes[bcount--] = l;
- l = 0;
- bytecount = 0;
- }
- }
- if (bcount >= 0) {
- bytes[0] = l;
- }
- return bytes;
- }
-
- /**
- * Shift Kmer to accept new input
- *
- * @param kmer
- * @param bytes
- * Kmer Array
- * @param c
- * Input new gene character
- * @return the shiftout gene, in gene code format
- */
- public static byte moveKmer(int k, byte[] kmer, byte c) {
- int byteNum = kmer.length;
- byte output = (byte) (kmer[byteNum - 1] & 0x03);
- for (int i = byteNum - 1; i > 0; i--) {
- byte in = (byte) (kmer[i - 1] & 0x03);
- kmer[i] = (byte) (((kmer[i] >>> 2) & 0x3f) | (in << 6));
- }
- int pos = ((k - 1) % 4) << 1;
- byte code = (byte) (GENE_CODE.getCodeFromSymbol(c) << pos);
- kmer[0] = (byte) (((kmer[0] >>> 2) & 0x3f) | code);
- return (byte) (1 << output);
- }
-
- public static byte reverseKmerByte(byte k) {
- int x = (((k >> 2) & 0x33) | ((k << 2) & 0xcc));
- return (byte) (((x >> 4) & 0x0f) | ((x << 4) & 0xf0));
- }
-
- public static byte[] reverseKmer(int k, byte[] kmer) {
- byte[] reverseKmer = new byte[kmer.length];
-
- int curPosAtKmer = ((k - 1) % 4) << 1;
- int curByteAtKmer = 0;
-
- int curPosAtReverse = 0;
- int curByteAtReverse = reverseKmer.length - 1;
- reverseKmer[curByteAtReverse] = 0;
- for (int i = 0; i < k; i++) {
- byte gene = (byte) ((kmer[curByteAtKmer] >> curPosAtKmer) & 0x03);
- reverseKmer[curByteAtReverse] |= gene << curPosAtReverse;
- curPosAtReverse += 2;
- if (curPosAtReverse >= 8) {
- curPosAtReverse = 0;
- reverseKmer[--curByteAtReverse] = 0;
- }
- curPosAtKmer -= 2;
- if (curPosAtKmer < 0) {
- curPosAtKmer = 6;
- curByteAtKmer++;
- }
- }
-
- return reverseKmer;
- }
-
- /**
- * Compress Reversed Kmer into bytes array AATAG will compress as
- * [0x000A,0xATAG]
- *
- * @param kmer
- * @param input
- * array
- * @param start
- * position
- * @return initialed kmer array
- */
- public static byte[] compressKmerReverse(int k, byte[] array, int start) {
- final int byteNum = getByteNumFromK(k);
- byte[] bytes = new byte[byteNum];
-
- byte l = 0;
- int bytecount = 0;
- int bcount = byteNum - 1;
- for (int i = start + k - 1; i >= 0; i--) {
- byte code = GENE_CODE.getCodeFromSymbol(array[i]);
- l |= (byte) (code << bytecount);
- bytecount += 2;
- if (bytecount == 8) {
- bytes[bcount--] = l;
- l = 0;
- bytecount = 0;
- }
- }
- if (bcount >= 0) {
- bytes[0] = l;
- }
- return bytes;
- }
-
- /**
- * Shift Kmer to accept new input
- *
- * @param kmer
- * @param bytes
- * Kmer Array
- * @param c
- * Input new gene character
- * @return the shiftout gene, in gene code format
- */
- public static byte moveKmerReverse(int k, byte[] kmer, byte c) {
- int pos = ((k - 1) % 4) << 1;
- byte output = (byte) ((kmer[0] >> pos) & 0x03);
- for (int i = 0; i < kmer.length - 1; i++) {
- byte in = (byte) ((kmer[i + 1] >> 6) & 0x03);
- kmer[i] = (byte) ((kmer[i] << 2) | in);
- }
- // (k%4) * 2
- if (k % 4 != 0) {
- kmer[0] &= (1 << ((k % 4) << 1)) - 1;
- }
- kmer[kmer.length - 1] = (byte) ((kmer[kmer.length - 1] << 2) | GENE_CODE
- .getCodeFromSymbol(c));
- return (byte) (1 << output);
- }
-
-}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index 1d41167..e8d3e67 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -21,20 +21,47 @@
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
+/**
+ * Fix kmer length byteswritable
+ * It was used to generate the graph in which phase the kmer length doesn't change.
+ * Thus the size of bytes doesn't change either.
+ */
public class KmerBytesWritable extends BinaryComparable implements
WritableComparable<BinaryComparable> {
- private static final int LENGTH_BYTES = 4;
- private static final byte[] EMPTY_BYTES = {};
- private byte size;
- private byte[] bytes;
+ protected int size;
+ protected byte[] bytes;
+ protected int kmerlength;
- public KmerBytesWritable() {
- this(EMPTY_BYTES);
+ /**
+ * Allocates the kmer storage for the given kmerlength:
+ * KmerUtil.getByteNumFromK(k) bytes.
+ * @param k
+ * kmerlength
+ */
+ public KmerBytesWritable(int k) {
+ this.kmerlength = k;
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ this.bytes = new byte[this.size];
}
- public KmerBytesWritable(byte[] bytes) {
- this.bytes = bytes;
- this.size = (byte) bytes.length;
+ public KmerBytesWritable(KmerBytesWritable right) {
+ this.kmerlength = right.kmerlength;
+ this.size = right.size;
+ this.bytes = new byte[right.size]; // own array: storage is not shared with right
+ set(right); // fill the new array from right
+ }
+
+ public byte getGeneCodeAtPosition(int pos) {
+ if (pos < 0 || pos >= kmerlength) { // outside the kmer; a negative pos would yield a bogus shift/index
+ return -1;
+ }
+ int posByte = pos / 4; // four 2-bit genes per byte
+ int shift = (pos % 4) << 1; // bit offset of the gene inside its byte
+ return (byte) ((bytes[size - 1 - posByte] >> shift) & 0x3); // position 0 is in the last byte
+ }
+
+ public int getKmerLength() {
+ return this.kmerlength; // length in genes; the length in bytes is getLength()
}
@Override
@@ -42,95 +69,168 @@
return bytes;
}
- @Deprecated
- public byte[] get() {
- return getBytes();
- }
-
@Override
public int getLength() {
- return (int) size;
+ return size;
}
- @Deprecated
- public int getSize() {
- return getLength();
- }
-
- public void setSize(byte size) {
- if ((int) size > getCapacity()) {
- setCapacity((byte) (size * 3 / 2));
- }
- this.size = size;
- }
-
- public int getCapacity() {
- return bytes.length;
- }
-
- public void setCapacity(byte new_cap) {
- if (new_cap != getCapacity()) {
- byte[] new_data = new byte[new_cap];
- if (new_cap < size) {
- size = new_cap;
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public void setByRead(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = this.size - 1;
+ for (int i = start; i < start + kmerlength; i++) {
+ byte code = GeneCode.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[bcount--] = l;
+ l = 0;
+ bytecount = 0;
}
- if (size != 0) {
- System.arraycopy(bytes, 0, new_data, 0, size);
- }
- bytes = new_data;
}
+ if (bcount >= 0) {
+ bytes[0] = l;
+ }
+ }
+
+ /**
+ * Compress the reversed Kmer into the bytes array, e.g. AATAG will compress as
+ * [0x000A,0xATAG]. Reads kmerlength symbols, from start + kmerlength - 1 down to start.
+ *
+ * @param array
+ * the read text, one gene symbol per byte
+ * @param start
+ * index in array of the first symbol of the (unreversed) kmer
+ */
+ public void setByReadReverse(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = size - 1;
+ for (int i = start + kmerlength - 1; i >= start; i--) { // stop at start, not 0: only kmerlength symbols belong to this kmer
+ byte code = GeneCode.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[0] = l;
+ }
+ }
+
+ /**
+ * Shift Kmer to accept new char input
+ *
+ * @param c
+ * Input new gene character
+ * @return the shifted-out gene as a one-bit bitmap (1 &lt;&lt; code), see shiftKmerWithNextCode
+ */
+ public byte shiftKmerWithNextChar(byte c) {
+ return shiftKmerWithNextCode(GeneCode.getCodeFromSymbol(c));
+ }
+
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shifted-out gene as a one-bit bitmap (1 &lt;&lt; code), not the raw gene code
+ */
+ public byte shiftKmerWithNextCode(byte c) {
+ byte output = (byte) (bytes[size - 1] & 0x03); // gene at position 0 (low bits of the last byte) leaves the kmer
+ for (int i = size - 1; i > 0; i--) {
+ byte in = (byte) (bytes[i - 1] & 0x03); // lowest gene of the preceding byte moves into this byte
+ bytes[i] = (byte) (((bytes[i] >>> 2) & 0x3f) | (in << 6));
+ }
+ int pos = ((kmerlength - 1) % 4) << 1; // bit offset of position kmerlength - 1 inside bytes[0]
+ byte code = (byte) (c << pos);
+ bytes[0] = (byte) (((bytes[0] >>> 2) & 0x3f) | code);
+ return (byte) (1 << output);
+ }
+
+ /**
+ * Shift Kmer to accept new input char
+ *
+ * @param c
+ * Input new gene character
+ * @return the shifted-out gene as a one-bit bitmap (1 &lt;&lt; code), see shiftKmerWithPreCode
+ */
+ public byte shiftKmerWithPreChar(byte c) {
+ return shiftKmerWithPreCode(GeneCode.getCodeFromSymbol(c));
+ }
+
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code, stored at position 0 (low bits of the last byte)
+ * @return the shifted-out gene as a one-bit bitmap (1 &lt;&lt; code), not the raw gene code
+ */
+ public byte shiftKmerWithPreCode(byte c) {
+ int pos = ((kmerlength - 1) % 4) << 1;
+ byte output = (byte) ((bytes[0] >> pos) & 0x03); // gene at position kmerlength - 1 leaves the kmer
+ for (int i = 0; i < size - 1; i++) {
+ byte in = (byte) ((bytes[i + 1] >> 6) & 0x03);
+ bytes[i] = (byte) ((bytes[i] << 2) | in);
+ }
+ bytes[size - 1] = (byte) ((bytes[size - 1] << 2) | c);
+ // keep only (k%4) * 2 bits of bytes[0]; must run after the shift above, which hits bytes[0] when size == 1
+ if (kmerlength % 4 != 0) {
+ bytes[0] &= (1 << ((kmerlength % 4) << 1)) - 1;
+ }
+ return (byte) (1 << output);
}
public void set(KmerBytesWritable newData) {
- set(newData.bytes, (byte) 0, newData.size);
+ set(newData.bytes, 0, newData.size);
}
- public void set(byte[] newData, byte offset, byte length) {
- setSize((byte) 0);
- setSize(length);
+ public void set(byte[] newData, int offset, int length) {
System.arraycopy(newData, offset, bytes, 0, size);
}
+ /**
+ * Don't read the kmerlength from datastream,
+ * Read it from configuration
+ */
+ @Override
public void readFields(DataInput in) throws IOException {
- setSize((byte) 0); // clear the old data
- setSize(in.readByte());
in.readFully(bytes, 0, size);
}
@Override
public void write(DataOutput out) throws IOException {
- out.writeByte(size);
out.write(bytes, 0, size);
}
@Override
public int hashCode() {
- return super.hashCode();
+ return super.hashCode() * this.kmerlength;
}
@Override
public boolean equals(Object right_obj) {
if (right_obj instanceof KmerBytesWritable)
- return super.equals(right_obj);
+ return this.kmerlength == ((KmerBytesWritable) right_obj).kmerlength
+ && super.equals(right_obj);
return false;
}
@Override
public String toString() {
- StringBuffer sb = new StringBuffer(3 * size);
- for (int idx = 0; idx < (int) size; idx++) {
- // if not the first, put a blank separator in
- if (idx != 0) {
- sb.append(' ');
- }
- String num = Integer.toHexString(0xff & bytes[idx]);
- // if it is only one digit, add a leading 0.
- if (num.length() < 2) {
- sb.append('0');
- }
- sb.append(num);
- }
- return sb.toString();
+ return KmerUtil.recoverKmerFrom(this.kmerlength, this.getBytes(), 0,
+ this.getLength());
}
public static class Comparator extends WritableComparator {
@@ -139,8 +239,7 @@
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- return compareBytes(b1, s1 + LENGTH_BYTES, l1 - LENGTH_BYTES, b2,
- s2 + LENGTH_BYTES, l2 - LENGTH_BYTES);
+ return compareBytes(b1, s1, l1, b2, s2, l2);
}
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
index 87eaa87..60ad5a3 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
@@ -47,7 +47,7 @@
@Override
public String toString() {
- return Kmer.GENE_CODE.getSymbolFromBitMap(adjBitMap) + '\t'
+ return GeneCode.getSymbolFromBitMap(adjBitMap) + '\t'
+ String.valueOf(count);
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
index 1c8c46e..82fe1a1 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
@@ -1,222 +1,33 @@
package edu.uci.ics.genomix.type;
-import java.util.Arrays;
public class KmerUtil {
- public static int countNumberOfBitSet(int i) {
- int c = 0;
- for (; i != 0; c++) {
- i &= i - 1;
+ public static int getByteNumFromK(int k) {
+ int x = k / 4;
+ if (k % 4 != 0) {
+ x += 1;
}
- return c;
- }
-
- public static int inDegree(byte bitmap) {
- return countNumberOfBitSet((bitmap >> 4) & 0x0f);
- }
-
- public static int outDegree(byte bitmap) {
- return countNumberOfBitSet(bitmap & 0x0f);
- }
-
- /**
- * Get last kmer from kmer-chain.
- * e.g. kmerChain is AAGCTA, if k =5, it will
- * return AGCTA
- * @param k
- * @param kInChain
- * @param kmerChain
- * @return LastKmer bytes array
- */
- public static byte[] getLastKmerFromChain(int k, int kInChain,
- byte[] kmerChain, int offset, int length) {
- if (k > kInChain) {
- return null;
- }
- if (k == kInChain) {
- return kmerChain.clone();
- }
- int byteNum = Kmer.getByteNumFromK(k);
- byte[] kmer = new byte[byteNum];
-
- /** from end to start */
- int byteInChain = length - 1 - (kInChain - k) / 4;
- int posInByteOfChain = ((kInChain - k) % 4) << 1; // *2
- int byteInKmer = byteNum - 1;
- for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
- kmer[byteInKmer] = (byte) ((0xff & kmerChain[offset + byteInChain]) >> posInByteOfChain);
- kmer[byteInKmer] |= ((kmerChain[offset + byteInChain - 1] << (8 - posInByteOfChain)));
- }
-
- /** last kmer byte */
- if (byteInKmer == 0) {
- kmer[0] = (byte) ((kmerChain[offset] & 0xff) >> posInByteOfChain);
- }
- return kmer;
- }
-
- /**
- * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if k=5, it will
- * return AAGCT
- *
- * @param k
- * @param kInChain
- * @param kmerChain
- * @return FirstKmer bytes array
- */
- public static byte[] getFirstKmerFromChain(int k, int kInChain,
- byte[] kmerChain, int offset, int length) {
- if (k > kInChain) {
- return null;
- }
- if (k == kInChain) {
- return kmerChain.clone();
- }
- int byteNum = Kmer.getByteNumFromK(k);
- byte[] kmer = new byte[byteNum];
-
- int i = 1;
- for (; i < kmer.length; i++) {
- kmer[kmer.length - i] = kmerChain[offset + length - i];
- }
- int posInByteOfChain = (k % 4) << 1; // *2
- if (posInByteOfChain == 0) {
- kmer[0] = kmerChain[offset + length - i];
- } else {
- kmer[0] = (byte) (kmerChain[offset + length - i] & ((1 << posInByteOfChain) - 1));
- }
- return kmer;
- }
-
- /**
- * Merge kmer with next neighbor in gene-code format.
- * The k of new kmer will increase by 1
- * e.g. AAGCT merge with A => AAGCTA
- * @param k :input k of kmer
- * @param kmer : input bytes of kmer
- * @param nextCode: next neighbor in gene-code format
- * @return the merged Kmer, this K of this Kmer is k+1
- */
- public static byte[] mergeKmerWithNextCode(int k, byte[] kmer, int offset, int length, byte nextCode) {
- int byteNum = length;
- if (k % 4 == 0) {
- byteNum++;
- }
- byte[] mergedKmer = new byte[byteNum];
- for (int i = 1; i <= length; i++) {
- mergedKmer[mergedKmer.length - i] = kmer[offset + length - i];
- }
- if (mergedKmer.length > length) {
- mergedKmer[0] = (byte) (nextCode & 0x3);
- } else {
- mergedKmer[0] = (byte) (kmer[offset] | ((nextCode & 0x3) << ((k % 4) << 1)));
- }
- return mergedKmer;
- }
-
- /**
- * Merge kmer with previous neighbor in gene-code format.
- * The k of new kmer will increase by 1
- * e.g. AAGCT merge with A => AAAGCT
- * @param k :input k of kmer
- * @param kmer : input bytes of kmer
- * @param preCode: next neighbor in gene-code format
- * @return the merged Kmer,this K of this Kmer is k+1
- */
- public static byte[] mergeKmerWithPreCode(int k, byte[] kmer, int offset, int length, byte preCode) {
- int byteNum = length;
- byte[] mergedKmer = null;
- int byteInMergedKmer = 0;
- if (k % 4 == 0) {
- byteNum++;
- mergedKmer = new byte[byteNum];
- mergedKmer[0] = (byte) ((kmer[offset] >> 6) & 0x3);
- byteInMergedKmer++;
- } else {
- mergedKmer = new byte[byteNum];
- }
- for (int i = 0; i < length - 1; i++, byteInMergedKmer++) {
- mergedKmer[byteInMergedKmer] = (byte) ((kmer[offset + i] << 2) | ((kmer[offset + i + 1] >> 6) & 0x3));
- }
- mergedKmer[byteInMergedKmer] = (byte) ((kmer[offset + length - 1] << 2) | (preCode & 0x3));
- return mergedKmer;
- }
-
- /**
- * Merge two kmer to one kmer
- * e.g. ACTA + ACCGT => ACTAACCGT
- * @param preK : previous k of kmer
- * @param kmerPre : bytes array of previous kmer
- * @param nextK : next k of kmer
- * @param kmerNext : bytes array of next kmer
- * @return merged kmer, the new k is @preK + @nextK
- */
- public static byte[] mergeTwoKmer(int preK, byte[] kmerPre, int offsetPre, int lengthPre, int nextK,
- byte[] kmerNext, int offsetNext, int lengthNext) {
- int byteNum = Kmer.getByteNumFromK(preK + nextK);
- byte[] mergedKmer = new byte[byteNum];
- int i = 1;
- for (; i <= lengthPre; i++) {
- mergedKmer[byteNum - i] = kmerPre[offsetPre + lengthPre - i];
- }
- if ( i > 1){
- i--;
- }
- if (preK % 4 == 0) {
- for (int j = 1; j <= lengthNext; j++) {
- mergedKmer[byteNum - i - j] = kmerNext[offsetNext + lengthNext - j];
- }
- } else {
- int posNeedToMove = ((preK % 4) << 1);
- mergedKmer[byteNum - i] |= kmerNext[offsetNext + lengthNext - 1] << posNeedToMove;
- for (int j = 1; j < lengthNext; j++) {
- mergedKmer[byteNum - i - j] = (byte) (((kmerNext[offsetNext + lengthNext
- - j] & 0xff) >> (8 - posNeedToMove)) | (kmerNext[offsetNext + lengthNext
- - j - 1] << posNeedToMove));
- }
- if ( nextK % 4 == 0 || (nextK % 4) * 2 + posNeedToMove > 8) {
- mergedKmer[0] = (byte) ((0xff & kmerNext[offsetNext] )>> (8 - posNeedToMove));
- }
- }
- return mergedKmer;
+ return x;
}
- /**
- * Safely shifted the kmer forward without change the input kmer
- * e.g. AGCGC shift with T => GCGCT
- * @param k: kmer length
- * @param kmer: input kmer
- * @param afterCode: input genecode
- * @return new created kmer that shifted by afterCode, the K will not change
- */
- public static byte[] shiftKmerWithNextCode(int k, final byte[] kmer, int offset, int length, byte afterCode){
- byte[] shifted = Arrays.copyOfRange(kmer, offset, offset+length);
- Kmer.moveKmer(k, shifted, Kmer.GENE_CODE.getSymbolFromCode(afterCode));
- return shifted;
+ public static byte reverseKmerByte(byte k) {
+ int x = (((k >> 2) & 0x33) | ((k << 2) & 0xcc)); // swap the two 2-bit groups inside each nibble
+ return (byte) (((x >> 4) & 0x0f) | ((x << 4) & 0xf0)); // swap the nibbles: the four 2-bit groups end up reversed
}
- /**
- * Safely shifted the kmer backward without change the input kmer
- * e.g. AGCGC shift with T => TAGCG
- * @param k: kmer length
- * @param kmer: input kmer
- * @param preCode: input genecode
- * @return new created kmer that shifted by preCode, the K will not change
- */
- public static byte[] shiftKmerWithPreCode(int k, final byte[] kmer, int offset, int length, byte preCode){
- byte[] shifted = Arrays.copyOfRange(kmer, offset, offset+length);
- Kmer.moveKmerReverse(k, shifted, Kmer.GENE_CODE.getSymbolFromCode(preCode));
- return shifted;
- }
-
- public static byte getGeneCodeAtPosition(int pos, int k, final byte[] kmer,
- int offset, int length) {
- if (pos >= k) {
- return -1;
+ public static String recoverKmerFrom(int k, byte[] keyData, int keyStart,
+ int keyLength) {
+ StringBuilder strKmer = new StringBuilder();
+ int byteId = keyStart + keyLength - 1;
+ byte currentbyte = keyData[byteId];
+ for (int geneCount = 0; geneCount < k; geneCount++) {
+ if (geneCount % 4 == 0 && geneCount > 0) {
+ currentbyte = keyData[--byteId];
+ }
+ strKmer.append((char) GeneCode.GENE_SYMBOL[(currentbyte >> ((geneCount % 4) * 2)) & 0x03]);
}
- int posByte = pos / 4;
- int shift = (pos % 4) << 1;
- return (byte) ((kmer[offset + length - 1 - posByte] >> shift) & 0x3);
+ return strKmer.toString();
}
+
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
new file mode 100644
index 0000000..67de889
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
@@ -0,0 +1,124 @@
+package edu.uci.ics.genomix.type;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparator;
+
+public class VKmerBytesWritable extends KmerBytesWritable{
+
+ public VKmerBytesWritable(int k) {
+ super(k);
+ }
+
+ public VKmerBytesWritable(KmerBytesWritable other){
+ super(other);
+ }
+
+ public void setSize(int size) {
+ if (size > getCapacity()) {
+ setCapacity( (size * 3 / 2)); // grow to 1.5x the requested size
+ }
+ this.size = size;
+ }
+
+ public int getCapacity() {
+ return bytes.length;
+ }
+
+ public void setCapacity(int new_cap) {
+ if (new_cap != getCapacity()) {
+ byte[] new_data = new byte[new_cap];
+ if (new_cap < size) {
+ size = new_cap; // shrinking below the current size truncates the content
+ }
+ if (size != 0) {
+ System.arraycopy(bytes, 0, new_data, 0, size);
+ }
+ bytes = new_data;
+ }
+ }
+
+ /**
+ * Resets this kmer to length k, then reads the Kmer from read text into the
+ * bytes array, e.g. AATAG will compress as [0x000G, 0xATAA].
+ *
+ * @param k new kmer length
+ * @param array the read text
+ * @param start position in array where the kmer starts
+ */
+ public void setByRead(int k, byte[] array, int start) {
+ reset(k);
+ super.setByRead(array, start);
+ }
+
+ /**
+ * Resets this kmer to length k, then compresses the reversed Kmer into the
+ * bytes array, e.g. AATAG will compress as [0x000A,0xATAG].
+ *
+ * @param k new kmer length
+ * @param array the read text
+ * @param start
+ * position in array where the kmer starts
+ */
+ public void setByReadReverse(int k, byte[] array, int start) {
+ reset(k);
+ super.setByReadReverse(array, start);
+ }
+
+ public void set(KmerBytesWritable newData) {
+ set(newData.kmerlength, newData.bytes, 0, newData.size);
+ }
+
+ public void set(int k, byte[] newData, int offset, int length) {
+ reset(k);
+ System.arraycopy(newData, offset, bytes, 0, size); // copies size bytes (derived from k); length is not consulted
+ }
+
+ /**
+ * Sets the kmer length to k and resizes the byte storage accordingly.
+ * @param k the new kmer length
+ */
+ public void reset(int k) {
+ this.kmerlength = k;
+ setSize( 0); // drop the old content first so that growing the array does not copy it
+ setSize( KmerUtil.getByteNumFromK(k));
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ reset(in.readInt());
+ in.readFully(bytes, 0, size);
+ }
+
+ /**
+ * Writes the kmerlength as a 4-byte int followed by the kmer bytes.
+ * The byte size is not written, since readFields derives it from kmerlength.
+ */
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(this.kmerlength);
+ out.write(bytes, 0, size);
+ }
+
+ public static class Comparator extends WritableComparator {
+ public final int LEAD_BYTES = 4; // size of the kmerlength int that write() puts before the kmer bytes
+ public Comparator() {
+ super(KmerBytesWritable.class);
+ }
+
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ int kmerlength1 = readInt(b1,s1);
+ int kmerlength2 = readInt(b2,s2);
+ if (kmerlength1 == kmerlength2){ // same length: order by the kmer bytes that follow the length prefix
+ return compareBytes(b1, s1 + LEAD_BYTES, l1-LEAD_BYTES, b2, s2+LEAD_BYTES, l2-LEAD_BYTES);
+ }
+ return kmerlength1 < kmerlength2 ? -1 : 1; // shorter kmer sorts first; explicit sign avoids int-subtraction overflow
+ }
+ }
+
+ static { // register this comparator. NOTE(review): the key is KmerBytesWritable.class, not VKmerBytesWritable.class - confirm intended
+ WritableComparator.define(KmerBytesWritable.class, new Comparator());
+ }
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
new file mode 100644
index 0000000..a7bcc8b
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
@@ -0,0 +1,241 @@
+package edu.uci.ics.genomix.type;
+
+public class VKmerBytesWritableFactory {
+ private VKmerBytesWritable kmer;
+
+ public VKmerBytesWritableFactory(int k) {
+     this.kmer = new VKmerBytesWritable(k); // the single kmer object this factory fills and hands back
+ }
+
+ /**
+ * Read a kmer from read text into the factory's kmer, e.g. AATAG will compress as
+ * [0x000G, 0xATAA]. The returned object is the factory's own kmer, not a copy.
+ *
+ * @param k kmer length
+ * @param array read text
+ * @param start position in array, handed to setByRead
+ */
+ public VKmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
+     this.kmer.setByRead(k, array, start);
+     return this.kmer;
+ }
+
+ /**
+ * Compress the reversed kmer into the factory's kmer (returned as-is, not copied),
+ * e.g. AATAG will compress as [0x000A,0xATAG]
+ * @param k kmer length
+ * @param array read text
+ * @param start position in array, handed to setByReadReverse
+ */
+ public VKmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
+     this.kmer.setByReadReverse(k, array, start);
+     return this.kmer;
+ }
+
+ /**
+ * Get last kmer from kmer-chain.
+ * e.g. kmerChain is AAGCTA, if lastK = 5, it will
+ * return AGCTA
+ * @param lastK length of the kmer to extract
+ * @param kmerChain chain to read the genes from
+ * @return the factory's shared kmer holding the last lastK genes,
+ * or null when lastK is larger than the chain's kmer length
+ */
+ public VKmerBytesWritable getLastKmerFromChain(int lastK, final KmerBytesWritable kmerChain) {
+ if (lastK > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (lastK == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(lastK);
+
+ /** from end to start */
+ int byteInChain = kmerChain.getLength() - 1 - (kmerChain.getKmerLength() - lastK) / 4;
+ int posInByteOfChain = ((kmerChain.getKmerLength() - lastK) % 4) << 1; // *2: two bits per gene
+ int byteInKmer = kmer.getLength() - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) { // low bits from this chain byte, high bits from the one before it
+ kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
+ kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ }
+
+ /** last kmer byte */
+ if (byteInKmer == 0) { // the loop stopped at chain byte 0 with kmer byte 0 still unfilled
+ kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ }
+ return kmer;
+ }
+
+ /**
+ * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if firstK=5, it will
+ * return AAGCT
+ *
+ * @param firstK length of the kmer to extract
+ * @param kmerChain chain to read the genes from
+ * @return the factory's shared kmer holding the first firstK genes,
+ * or null when firstK is larger than the chain's kmer length
+ */
+ public VKmerBytesWritable getFirstKmerFromChain(int firstK, final KmerBytesWritable kmerChain) {
+ if (firstK > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (firstK == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(firstK);
+
+ int i = 1;
+ for (; i < kmer.getLength(); i++) { // copy every byte except byte 0, aligned at the end of both arrays
+ kmer.getBytes()[kmer.getLength() - i] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ }
+ int posInByteOfChain = (firstK % 4) << 1; // *2
+ if (posInByteOfChain == 0) { // firstK fills byte 0 completely: take the chain byte unchanged
+ kmer.getBytes()[0] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ } else { // keep only the low posInByteOfChain bits of the chain byte
+ kmer.getBytes()[0] = (byte) (kmerChain.getBytes()[kmerChain.getLength() - i] & ((1 << posInByteOfChain) - 1));
+ }
+ return kmer;
+ }
+
+ /**
+ * Merge kmer with next neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAGCTA
+ * @param kmer : input kmer
+ * @param nextCode: next neighbor in gene-code format; only its low 2 bits are used
+ * @return the merged Kmer, held in the factory's shared kmer;
+ * its K is the input's kmer length + 1
+ */
+ public VKmerBytesWritable mergeKmerWithNextCode(final KmerBytesWritable kmer, byte nextCode) {
+ this.kmer.reset(kmer.getKmerLength()+1);
+ for (int i = 1; i <= kmer.getLength(); i++) { // copy the input bytes, aligned at the end of both arrays
+ this.kmer.getBytes()[this.kmer.getLength() - i] = kmer.getBytes()[kmer.getLength() - i];
+ }
+ if (this.kmer.getLength() > kmer.getLength()) { // merged kmer has one more byte: byte 0 holds only nextCode
+ this.kmer.getBytes()[0] = (byte) (nextCode & 0x3);
+ } else { // same byte count: OR nextCode into byte 0 just above the input's genes
+ this.kmer.getBytes()[0] = (byte) (kmer.getBytes()[0] | ((nextCode & 0x3) << ((kmer.getKmerLength() % 4) << 1)));
+ }
+ return this.kmer;
+ }
+
+ /**
+ * Merge kmer with previous neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAAGCT
+ * @param kmer : input kmer
+ * @param preCode: previous neighbor in gene-code format; only its low 2 bits are used
+ * @return the merged Kmer, held in the factory's shared kmer;
+ * its K is the input's kmer length + 1
+ */
+ public VKmerBytesWritable mergeKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+ this.kmer.reset(kmer.getKmerLength()+1);
+ int byteInMergedKmer = 0;
+ if (kmer.getKmerLength() % 4 == 0) { // input kmer length is a multiple of 4: the top 2 bits of its byte 0 go into the extra leading byte
+ this.kmer.getBytes()[0] = (byte) ((kmer.getBytes()[0] >> 6) & 0x3);
+ byteInMergedKmer++;
+ }
+ for (int i = 0; i < kmer.getLength() - 1; i++, byteInMergedKmer++) { // shift each byte up 2 bits, pulling in the top 2 bits of the following byte
+ this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[i] << 2) | ((kmer.getBytes()[ i + 1] >> 6) & 0x3));
+ }
+ this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[kmer.getLength() - 1] << 2) | (preCode & 0x3)); // preCode lands in the lowest 2 bits of the last byte
+ return this.kmer;
+ }
+
+ /**
+ * Merge two kmer to one kmer
+ * e.g. ACTA + ACCGT => ACTAACCGT
+ * preKmer's bytes are copied unchanged; nextKmer's bits are shifted unless preKmer's kmer length is a multiple of 4.
+ * @param preKmer : kmer that forms the front of the result
+ * @param nextKmer : kmer appended after preKmer
+ * @return merged kmer, held in the factory's shared kmer;
+ * the new k is preKmer.getKmerLength() + nextKmer.getKmerLength()
+ */
+ public VKmerBytesWritable mergeTwoKmer(final KmerBytesWritable preKmer, final KmerBytesWritable nextKmer) {
+ kmer.reset(preKmer.getKmerLength() + nextKmer.getKmerLength());
+ int i = 1;
+ for (; i <= preKmer.getLength(); i++) { // copy preKmer's bytes, aligned at the end of both arrays
+ kmer.getBytes()[kmer.getLength() - i] = preKmer.getBytes()[preKmer.getLength() - i];
+ }
+ if ( i > 1){ // step i back onto the last offset the copy loop wrote
+ i--;
+ }
+ if (preKmer.getKmerLength() % 4 == 0) { // preKmer ends on a byte boundary: nextKmer's bytes are copied unshifted
+ for (int j = 1; j <= nextKmer.getLength(); j++) {
+ kmer.getBytes()[kmer.getLength() - i - j] = nextKmer.getBytes()[nextKmer.getLength() - j];
+ }
+ } else {
+ int posNeedToMove = ((preKmer.getKmerLength() % 4) << 1); // bits already taken by preKmer in its byte 0
+ kmer.getBytes()[kmer.getLength() - i] |= nextKmer.getBytes()[ nextKmer.getLength() - 1] << posNeedToMove;
+ for (int j = 1; j < nextKmer.getLength(); j++) {
+ kmer.getBytes()[kmer.getLength() - i - j] = (byte) (((nextKmer.getBytes()[ nextKmer.getLength()
+ - j] & 0xff) >> (8 - posNeedToMove)) | (nextKmer.getBytes()[nextKmer.getLength()
+ - j - 1] << posNeedToMove));
+ }
+ if ( nextKmer.getKmerLength() % 4 == 0 || (nextKmer.getKmerLength() % 4) * 2 + posNeedToMove > 8) { // nextKmer's byte 0 spills over into byte 0 of the result
+ kmer.getBytes()[0] = (byte) ((0xff & nextKmer.getBytes()[0] )>> (8 - posNeedToMove));
+ }
+ }
+ return kmer;
+ }
+
+ /**
+ * Shift a copy of the kmer forward without changing the input kmer,
+ * e.g. AGCGC shift with T => GCGCT
+ * The copy is kept in the factory's shared kmer, which other factory methods overwrite.
+ * @param kmer: input kmer
+ * @param afterCode: input genecode
+ * @return the factory's shared kmer, shifted by afterCode; the K will not change
+ */
+ public VKmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode) {
+     this.kmer.set(kmer); // copy first so the shift never touches the caller's kmer
+     this.kmer.shiftKmerWithNextCode(afterCode);
+     return this.kmer;
+ }
+
+ /**
+ * Shift a copy of the kmer backward without changing the input kmer,
+ * e.g. AGCGC shift with T => TAGCG
+ * The copy is kept in the factory's shared kmer, which other factory methods overwrite.
+ * @param kmer: input kmer
+ * @param preCode: input genecode
+ * @return the factory's shared kmer, shifted by preCode; the K will not change
+ */
+ public VKmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+     this.kmer.set(kmer); // copy first so the shift never touches the caller's kmer
+     this.kmer.shiftKmerWithPreCode(preCode);
+     return this.kmer;
+ }
+
+ /**
+ * Get the reverse sequence of the given kmer: the gene order is flipped, the genes are not complemented.
+ * @param kmer kmer to read; the result is written to, and returned in, the factory's shared kmer
+ */
+ public VKmerBytesWritable reverse(final KmerBytesWritable kmer) {
+     this.kmer.reset(kmer.getKmerLength());
+
+     int curPosAtKmer = ((kmer.getKmerLength() - 1) % 4) << 1; // bit offset of the last gene inside input byte 0
+     int curByteAtKmer = 0;
+
+     int curPosAtReverse = 0;
+     int curByteAtReverse = this.kmer.getLength() - 1;
+     this.kmer.getBytes()[curByteAtReverse] = 0;
+     for (int i = 0; i < kmer.getKmerLength(); i++) {
+         byte gene = (byte) ((kmer.getBytes()[curByteAtKmer] >> curPosAtKmer) & 0x03);
+         this.kmer.getBytes()[curByteAtReverse] |= gene << curPosAtReverse;
+         curPosAtReverse += 2;
+         if (curPosAtReverse >= 8 && curByteAtReverse > 0) { // byte 0 is the last one: never step to index -1 (k % 4 == 0)
+             curPosAtReverse = 0;
+             this.kmer.getBytes()[--curByteAtReverse] = 0;
+         }
+         curPosAtKmer -= 2;
+         if (curPosAtKmer < 0) { // input byte exhausted: continue at the top of the next one
+             curPosAtKmer = 6;
+             curByteAtKmer++;
+         }
+     }
+     return this.kmer;
+ }
+}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
new file mode 100644
index 0000000..ea1d0c2
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
@@ -0,0 +1,80 @@
+package edu.uci.ics.genomix.example.kmer;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
+public class KmerBytesWritableTest {
+    static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
+    static int k = 7;
+    // Reading k symbols at offsets 0 and 1 must print back as the same text.
+    @Test
+    public void TestCompressKmer() {
+        KmerBytesWritable kmer = new KmerBytesWritable(k);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AATAGAA", kmer.toString());
+
+        kmer.setByRead(array, 1);
+        Assert.assertEquals("ATAGAAG", kmer.toString());
+    }
+    // Shifting in the next character must return the adjacency bit of 'A' and leave ATAGAAG.
+    @Test
+    public void TestMoveKmer() {
+        KmerBytesWritable kmer = new KmerBytesWritable(k);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AATAGAA", kmer.toString());
+
+        for (int i = k; i < array.length - 1; i++) { // empty range while k == array.length - 1
+            kmer.shiftKmerWithNextChar(array[i]); // array holds characters, so use the Char variant as below
+            Assert.assertTrue(false);
+        }
+
+        byte out = kmer.shiftKmerWithNextChar(array[array.length - 1]);
+        Assert.assertEquals(GeneCode.getAdjBit((byte) 'A'), out);
+        Assert.assertEquals("ATAGAAG", kmer.toString());
+    }
+
+    // setByReadReverse at offset 1 must store that window in reverse order.
+    @Test
+    public void TestCompressKmerReverse() {
+        KmerBytesWritable kmer = new KmerBytesWritable(k);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AATAGAA", kmer.toString());
+
+        kmer.setByReadReverse(array, 1);
+        Assert.assertEquals("GAAGATA", kmer.toString());
+    }
+    // Shifting in a preceding character must return the adjacency bit of 'A' and leave GAATAGA.
+    @Test
+    public void TestMoveKmerReverse() {
+        KmerBytesWritable kmer = new KmerBytesWritable(k);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AATAGAA", kmer.toString());
+
+        for (int i = k; i < array.length - 1; i++) { // empty range while k == array.length - 1
+            kmer.shiftKmerWithPreChar(array[i]);
+            Assert.assertTrue(false);
+        }
+
+        byte out = kmer.shiftKmerWithPreChar(array[array.length - 1]);
+        Assert.assertEquals(GeneCode.getAdjBit((byte) 'A'), out);
+        Assert.assertEquals("GAATAGA", kmer.toString());
+    }
+    // Every position of a 9-mer must decode back to the symbol it was read from.
+    @Test
+    public void TestGetGene() {
+        KmerBytesWritable kmer = new KmerBytesWritable(9);
+        String text = "AGCTGACCG";
+        byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G' };
+        kmer.setByRead(array, 0);
+
+        for (int i = 0; i < 9; i++) {
+            Assert.assertEquals(text.charAt(i),
+                    (char) (GeneCode.getSymbolFromCode(kmer.getGeneCodeAtPosition(i))));
+        }
+    }
+
+}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java
deleted file mode 100644
index d9c1846..0000000
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java
+++ /dev/null
@@ -1,81 +0,0 @@
-package edu.uci.ics.genomix.example.kmer;
-
-import junit.framework.Assert;
-
-import org.junit.Test;
-
-import edu.uci.ics.genomix.type.Kmer;
-
-public class KmerTest {
- static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
- static int k = 7;
-
- @Test
- public void TestCompressKmer() {
- byte[] kmer = Kmer.compressKmer(k, array, 0);
- String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "AATAGAA");
-
- kmer = Kmer.compressKmer(k, array, 1);
- result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "ATAGAAG");
- }
-
- @Test
- public void TestMoveKmer(){
- byte[] kmer = Kmer.compressKmer(k, array, 0);
- String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "AATAGAA");
-
- for (int i = k; i < array.length-1; i++) {
- Kmer.moveKmer(k, kmer, array[i]);
- Assert.assertTrue(false);
- }
-
- byte out = Kmer.moveKmer(k, kmer, array[array.length - 1]);
- Assert.assertEquals(out, Kmer.GENE_CODE.getAdjBit((byte) 'A'));
- result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "ATAGAAG");
- }
-
-
- @Test
- public void TestReverseKmer(){
- byte[] kmer = Kmer.compressKmer(k, array, 0);
- String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "AATAGAA");
- byte[] reversed = Kmer.reverseKmer(k, kmer);
- result = Kmer.recoverKmerFrom(k, reversed, 0, kmer.length);
- Assert.assertEquals(result, "AAGATAA");
- }
-
- @Test
- public void TestCompressKmerReverse() {
- byte[] kmer = Kmer.compressKmerReverse(k, array, 0);
- String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "AAGATAA");
-
- kmer = Kmer.compressKmerReverse(k, array, 1);
- result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "GAAGATA");
- }
-
- @Test
- public void TestMoveKmerReverse(){
- byte[] kmer = Kmer.compressKmerReverse(k, array, 0);
- String result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "AAGATAA");
-
- for (int i = k; i < array.length-1; i++) {
- Kmer.moveKmerReverse(k, kmer, array[i]);
- Assert.assertTrue(false);
- }
-
- byte out = Kmer.moveKmerReverse(k, kmer, array[array.length - 1]);
- Assert.assertEquals(out, Kmer.GENE_CODE.getAdjBit((byte) 'A'));
- result = Kmer.recoverKmerFrom(k, kmer, 0, kmer.length);
- Assert.assertEquals(result, "GAAGATA");
- }
-
-
-}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java
deleted file mode 100644
index 854bb47..0000000
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java
+++ /dev/null
@@ -1,138 +0,0 @@
-package edu.uci.ics.genomix.example.kmer;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import edu.uci.ics.genomix.type.Kmer;
-import edu.uci.ics.genomix.type.KmerUtil;
-
-public class KmerUtilTest {
- static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C','G','T'};
-
- @Test
- public void TestDegree(){
- Assert.assertTrue(KmerUtil.inDegree((byte) 0xff) == 4);
- Assert.assertTrue(KmerUtil.outDegree((byte) 0xff) == 4);
- Assert.assertTrue(KmerUtil.inDegree((byte) 0x3f) == 2);
- Assert.assertTrue(KmerUtil.outDegree((byte) 0x01) == 1);
- Assert.assertTrue(KmerUtil.inDegree((byte) 0x01) == 0);
- }
-
- @Test
- public void TestGetLastKmer(){
- byte[] kmerChain = Kmer.compressKmer(9, array, 0);
- Assert.assertEquals("AGCTGACCG", Kmer.recoverKmerFrom(9, kmerChain, 0, kmerChain.length));
- byte[] lastKmer ;
- for(int i = 8; i>0 ; i--){
- lastKmer = KmerUtil.getLastKmerFromChain(i, 9, kmerChain, 0, kmerChain.length);
-// System.out.println(Kmer.recoverKmerFrom(i, lastKmer, 0, lastKmer.length));
- Assert.assertEquals("AGCTGACCG".substring(9-i), Kmer.recoverKmerFrom(i, lastKmer, 0, lastKmer.length));
- }
- }
-
- @Test
- public void TestMergeNext(){
- byte[] kmer = Kmer.compressKmer(9, array, 0);
- String text = "AGCTGACCG";
- Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
- for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
- kmer = KmerUtil.mergeKmerWithNextCode(9+x, kmer, 0, kmer.length, x);
-// System.out.println(Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
- text = text + (char)Kmer.GENE_SYMBOL[x];
- Assert.assertEquals(text, Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
- }
- for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
- kmer = KmerUtil.mergeKmerWithNextCode(13+x, kmer,0, kmer.length, x);
-// System.out.println(Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
- text = text + (char)Kmer.GENE_SYMBOL[x];
- Assert.assertEquals(text, Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
- }
- }
-
- @Test
- public void TestMergePre(){
- byte[] kmer = Kmer.compressKmer(9, array, 0);
- String text = "AGCTGACCG";
- Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
- for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
- kmer = KmerUtil.mergeKmerWithPreCode(9+x, kmer, 0, kmer.length,x);
-// System.out.println(Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
- text = (char)Kmer.GENE_SYMBOL[x] + text;
- Assert.assertEquals(text , Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
- }
- for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
- kmer = KmerUtil.mergeKmerWithPreCode(13+x, kmer,0, kmer.length, x);
-// System.out.println(Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
- text = (char)Kmer.GENE_SYMBOL[x] + text;
- Assert.assertEquals(text , Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
- }
- }
-
- @Test
- public void TestMergeTwoKmer(){
- byte[] kmer1 = Kmer.compressKmer(9, array, 0);
- String text1 = "AGCTGACCG";
- byte[] kmer2 = Kmer.compressKmer(9, array, 1);
- String text2 = "GCTGACCGT";
- Assert.assertEquals(text1, Kmer.recoverKmerFrom(9, kmer1, 0, kmer1.length));
- Assert.assertEquals(text2, Kmer.recoverKmerFrom(9, kmer2, 0, kmer2.length));
-
- byte[] merged = KmerUtil.mergeTwoKmer(9, kmer1,0,kmer1.length, 9, kmer2,0,kmer2.length);
- Assert.assertEquals(text1+text2, Kmer.recoverKmerFrom(9+9, merged, 0, merged.length));
-
- byte[] kmer3 = Kmer.compressKmer(3, array, 1);
- String text3 = "GCT";
- Assert.assertEquals(text3, Kmer.recoverKmerFrom(3, kmer3, 0, kmer3.length));
- merged = KmerUtil.mergeTwoKmer(9, kmer1, 0 , kmer1.length, 3, kmer3, 0, kmer3.length);
- Assert.assertEquals(text1+text3, Kmer.recoverKmerFrom(9+3, merged, 0, merged.length));
- merged = KmerUtil.mergeTwoKmer(3, kmer3, 0 , kmer3.length, 9, kmer1, 0, kmer1.length);
- Assert.assertEquals(text3+text1, Kmer.recoverKmerFrom(9+3, merged, 0, merged.length));
-
- byte[] kmer4 = Kmer.compressKmer(8, array, 0);
- String text4 = "AGCTGACC";
- Assert.assertEquals(text4, Kmer.recoverKmerFrom(8, kmer4, 0, kmer4.length));
- merged = KmerUtil.mergeTwoKmer(8, kmer4, 0, kmer4.length, 3, kmer3, 0, kmer3.length);
- Assert.assertEquals(text4+text3, Kmer.recoverKmerFrom(8+3, merged, 0, merged.length));
-
- byte[] kmer5 = Kmer.compressKmer(7, array, 0);
- String text5 = "AGCTGAC";
- byte[] kmer6 = Kmer.compressKmer(9, array, 1);
- String text6 = "GCTGACCGT";
- merged = KmerUtil.mergeTwoKmer(7, kmer5, 0, kmer5.length,9, kmer6, 0, kmer6.length);
- Assert.assertEquals(text5+text6, Kmer.recoverKmerFrom(7+9, merged, 0, merged.length));
-
- byte[] kmer7 = Kmer.compressKmer(6, array, 1);
- String text7 = "GCTGAC";
- merged = KmerUtil.mergeTwoKmer(7, kmer5, 0, kmer5.length, 6, kmer7, 0, kmer7.length);
- Assert.assertEquals(text5+text7, Kmer.recoverKmerFrom(7+6, merged, 0, merged.length));
-
- byte[] kmer8 = Kmer.compressKmer(4, array, 1);
- String text8 = "GCTG";
- merged = KmerUtil.mergeTwoKmer(7, kmer5, 0, kmer5.length, 4, kmer8, 0, kmer8.length);
- Assert.assertEquals(text5+text8, Kmer.recoverKmerFrom(7+4, merged, 0, merged.length));
-
- }
- @Test
- public void TestShift(){
- byte[] kmer = Kmer.compressKmer(9, array, 0);
- String text = "AGCTGACCG";
- Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
-
- byte [] kmerForward = KmerUtil.shiftKmerWithNextCode(9, kmer,0, kmer.length, Kmer.GENE_CODE.A);
- Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
- Assert.assertEquals("GCTGACCGA", Kmer.recoverKmerFrom(9, kmerForward, 0, kmerForward.length));
- byte [] kmerBackward = KmerUtil.shiftKmerWithPreCode(9, kmer,0, kmer.length,Kmer.GENE_CODE.C);
- Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
- Assert.assertEquals("CAGCTGACC", Kmer.recoverKmerFrom(9, kmerBackward, 0, kmerBackward.length));
-
- }
- @Test
- public void TestGetGene(){
- byte[] kmer = Kmer.compressKmer(9, array, 0);
- String text = "AGCTGACCG";
- for(int i =0; i < 9; i++){
- Assert.assertEquals(text.charAt(i),
- (char)(Kmer.GENE_CODE.getSymbolFromCode(KmerUtil.getGeneCodeAtPosition(i, 9, kmer, 0, kmer.length))));
- }
- }
-}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
new file mode 100644
index 0000000..a0b8845
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
@@ -0,0 +1,158 @@
+package edu.uci.ics.genomix.example.kmer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+public class VKmerBytesWritableFactoryTest {
+ static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C','G','T'};
+
+ VKmerBytesWritableFactory kmerFactory = new VKmerBytesWritableFactory(8);
+
+ @Test
+ public void TestDegree(){
+ Assert.assertTrue(GeneCode.inDegree((byte) 0xff) == 4);
+ Assert.assertTrue(GeneCode.outDegree((byte) 0xff) == 4);
+ Assert.assertTrue(GeneCode.inDegree((byte) 0x3f) == 2);
+ Assert.assertTrue(GeneCode.outDegree((byte) 0x01) == 1);
+ Assert.assertTrue(GeneCode.inDegree((byte) 0x01) == 0);
+ }
+
+ @Test
+ public void TestGetLastKmer(){
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead( array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ KmerBytesWritable lastKmer ;
+ for(int i = 8; i>0 ; i--){
+ lastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9-i), lastKmer.toString());
+ }
+ VKmerBytesWritable vlastKmer ;
+ for(int i = 8; i>0 ; i--){
+ vlastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9-i), vlastKmer.toString());
+ }
+ }
+
+ @Test
+ public void TestMergeNext(){
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+
+ String text = "AGCTGACCG";
+ for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
+ text = text + (char)GeneCode.GENE_SYMBOL[x];
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
+ text = text + (char)GeneCode.GENE_SYMBOL[x];
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ }
+
+ @Test
+ public void TestMergePre(){
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ String text = "AGCTGACCG";
+ for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
+ text = (char)GeneCode.GENE_SYMBOL[x] + text;
+ Assert.assertEquals(text , newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
+ text = (char)GeneCode.GENE_SYMBOL[x] + text;
+ Assert.assertEquals(text , newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ }
+
+ @Test
+ public void TestMergeTwoKmer(){
+ KmerBytesWritable kmer1 = new KmerBytesWritable(9);
+ kmer1.setByRead( array, 0);
+ String text1 = "AGCTGACCG";
+ KmerBytesWritable kmer2 = new KmerBytesWritable(9);
+ kmer2.setByRead(array, 1);
+ String text2 = "GCTGACCGT";
+ Assert.assertEquals(text1, kmer1.toString());
+ Assert.assertEquals(text2, kmer2.toString());
+
+ KmerBytesWritable merged = kmerFactory.mergeTwoKmer(kmer1, kmer2);
+ Assert.assertEquals(text1+text2, merged.toString());
+
+ KmerBytesWritable kmer3 = new KmerBytesWritable(3);
+ kmer3.setByRead(array, 1);
+ String text3 = "GCT";
+ Assert.assertEquals(text3, kmer3.toString());
+
+ merged = kmerFactory.mergeTwoKmer(kmer1, kmer3);
+ Assert.assertEquals(text1+text3, merged.toString());
+ merged = kmerFactory.mergeTwoKmer( kmer3, kmer1);
+ Assert.assertEquals(text3+text1, merged.toString());
+
+ KmerBytesWritable kmer4 = new KmerBytesWritable(8);
+ kmer4.setByRead( array, 0);
+ String text4 = "AGCTGACC";
+ Assert.assertEquals(text4, kmer4.toString());
+ merged = kmerFactory.mergeTwoKmer(kmer4, kmer3);
+ Assert.assertEquals(text4+text3, merged.toString());
+
+ KmerBytesWritable kmer5 = new KmerBytesWritable(7);
+ kmer5.setByRead( array, 0);
+ String text5 = "AGCTGAC";
+ VKmerBytesWritable kmer6 = new VKmerBytesWritable(9);
+ kmer6.setByRead(9, array, 1);
+ String text6 = "GCTGACCGT";
+ merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
+ Assert.assertEquals(text5+text6, merged.toString());
+
+ kmer6.setByRead(6, array, 1);
+ String text7 = "GCTGAC";
+ merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
+ Assert.assertEquals(text5+text7, merged.toString());
+
+ kmer6.setByRead(4, array, 1);
+ String text8 = "GCTG";
+ merged = kmerFactory.mergeTwoKmer( kmer5, kmer6);
+ Assert.assertEquals(text5+text8, merged.toString());
+
+ }
+ @Test
+ public void TestShift(){
+ VKmerBytesWritable kmer = new VKmerBytesWritable(kmerFactory.getKmerByRead(9, array, 0));
+ String text = "AGCTGACCG";
+ Assert.assertEquals(text, kmer.toString());
+
+ VKmerBytesWritable kmerForward = kmerFactory.shiftKmerWithNextCode(kmer,GeneCode.A);
+ Assert.assertEquals(text, kmer.toString());
+ Assert.assertEquals("GCTGACCGA", kmerForward.toString());
+ VKmerBytesWritable kmerBackward = kmerFactory.shiftKmerWithPreCode(kmer,GeneCode.C);
+ Assert.assertEquals(text, kmer.toString());
+ Assert.assertEquals("CAGCTGACC", kmerBackward.toString());
+
+ }
+
+
+ @Test
+ public void TestReverseKmer(){
+ KmerBytesWritable kmer = new KmerBytesWritable(7);
+ kmer.setByRead( array, 0);
+ Assert.assertEquals(kmer.toString(), "AGCTGAC");
+ KmerBytesWritable reversed = kmerFactory.reverse(kmer);
+ Assert.assertEquals(reversed.toString(), "CAGTCGA");
+ }
+}