Split Kmer into fixed-size Kmer and variable-size VKmer
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index e4979ec..e8d3e67 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -21,11 +21,16 @@
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
+/**
+ * Fixed-length kmer BytesWritable.
+ * It is used while generating the graph, a phase in which the kmer length doesn't change.
+ * Thus the size of the byte array doesn't change either.
+ */
public class KmerBytesWritable extends BinaryComparable implements
WritableComparable<BinaryComparable> {
- private byte size;
- private byte[] bytes;
- private byte kmerlength;
+ protected int size;
+ protected byte[] bytes;
+ protected int kmerlength;
/**
* Initial Kmer space by kmerlength
@@ -34,15 +39,15 @@
* kmerlength
*/
public KmerBytesWritable(int k) {
- int length = KmerUtil.getByteNumFromK(k);
- this.bytes = new byte[length];
- this.size = (byte) length;
- this.kmerlength = (byte) k;
+ this.kmerlength = k;
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ this.bytes = new byte[this.size];
}
public KmerBytesWritable(KmerBytesWritable right) {
- this.bytes = new byte[right.size];
+ this.kmerlength = right.kmerlength;
this.size = right.size;
+ this.bytes = new byte[right.size];
set(right);
}
@@ -66,31 +71,7 @@
@Override
public int getLength() {
- return (int) size;
- }
-
- public void setSize(byte size) {
- if ((int) size > getCapacity()) {
- setCapacity((byte) (size * 3 / 2));
- }
- this.size = size;
- }
-
- public int getCapacity() {
- return bytes.length;
- }
-
- public void setCapacity(byte new_cap) {
- if (new_cap != getCapacity()) {
- byte[] new_data = new byte[new_cap];
- if (new_cap < size) {
- size = new_cap;
- }
- if (size != 0) {
- System.arraycopy(bytes, 0, new_data, 0, size);
- }
- bytes = new_data;
- }
+ return size;
}
/**
@@ -101,15 +82,11 @@
* @param array
* @param start
*/
- public void setByRead(int k, byte[] array, int start) {
- this.kmerlength = (byte) k;
- setSize((byte) 0);
- setSize((byte) KmerUtil.getByteNumFromK(k));
-
+ public void setByRead(byte[] array, int start) {
byte l = 0;
int bytecount = 0;
int bcount = this.size - 1;
- for (int i = start; i < start + k; i++) {
+ for (int i = start; i < start + kmerlength; i++) {
byte code = GeneCode.getCodeFromSymbol(array[i]);
l |= (byte) (code << bytecount);
bytecount += 2;
@@ -133,15 +110,11 @@
* @param start
* position
*/
- public void setByReadReverse(int k, byte[] array, int start) {
- this.kmerlength = (byte) k;
- setSize((byte) 0);
- setSize((byte) KmerUtil.getByteNumFromK(k));
-
+ public void setByReadReverse(byte[] array, int start) {
byte l = 0;
int bytecount = 0;
int bcount = size - 1;
- for (int i = start + k - 1; i >= 0; i--) {
+ for (int i = start + kmerlength - 1; i >= 0; i--) {
byte code = GeneCode.getCodeFromSymbol(array[i]);
l |= (byte) (code << bytecount);
bytecount += 2;
@@ -220,43 +193,30 @@
}
public void set(KmerBytesWritable newData) {
- set(newData.kmerlength, newData.bytes, (byte) 0, newData.size);
+ set(newData.bytes, 0, newData.size);
}
- public void set(int k, byte[] newData, int offset, int length) {
- this.kmerlength = (byte) k;
- setSize((byte) 0);
- setSize((byte) (length - offset));
+ public void set(byte[] newData, int offset, int length) {
System.arraycopy(newData, offset, bytes, 0, size);
}
/**
- * Reset array by kmerlength
- * @param k
+ * The kmerlength is not read from the data stream;
+ * it is taken from configuration instead.
*/
- public void reset(int k) {
- this.kmerlength = (byte) k;
- setSize((byte) 0);
- setSize((byte) KmerUtil.getByteNumFromK(k));
- }
-
+ @Override
public void readFields(DataInput in) throws IOException {
- this.kmerlength = in.readByte();
- setSize((byte) 0); // clear the old data
- setSize(in.readByte());
in.readFully(bytes, 0, size);
}
@Override
public void write(DataOutput out) throws IOException {
- out.writeByte(this.kmerlength);
- out.writeByte(size);
out.write(bytes, 0, size);
}
@Override
public int hashCode() {
- return super.hashCode();
+ return super.hashCode() * this.kmerlength;
}
@Override
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
new file mode 100644
index 0000000..67de889
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
@@ -0,0 +1,124 @@
+package edu.uci.ics.genomix.type;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparator;
+
+public class VKmerBytesWritable extends KmerBytesWritable{
+ // Variable-length kmer: unlike the fixed-size parent, k may change per record and is serialized with the data.
+ public VKmerBytesWritable(int k) {
+ super(k);
+ }
+
+ public VKmerBytesWritable(KmerBytesWritable other){
+ super(other);
+ }
+ // Sets the logical byte count, growing the backing array to 1.5x the request when it is too small.
+ public void setSize(int size) {
+ if (size > getCapacity()) {
+ setCapacity( (size * 3 / 2));
+ }
+ this.size = size;
+ }
+ // Returns the length of the backing array, which may exceed the logical size.
+ public int getCapacity() {
+ return bytes.length;
+ }
+ // Reallocates the backing array to exactly new_cap bytes, truncating size if needed and keeping the first size bytes.
+ public void setCapacity(int new_cap) {
+ if (new_cap != getCapacity()) {
+ byte[] new_data = new byte[new_cap];
+ if (new_cap < size) {
+ size = new_cap;
+ }
+ if (size != 0) {
+ System.arraycopy(bytes, 0, new_data, 0, size);
+ }
+ bytes = new_data;
+ }
+ }
+
+ /**
+ * Resize this kmer to length k, then pack k symbols of the read text into the byte
+ * array, e.g. AATAG will compress as [0x000G, 0xATAA]
+ *
+ * @param k kmer length to switch to
+ * @param array read text, one symbol per byte
+ * @param start index in array of the first symbol of the kmer
+ */
+ public void setByRead(int k, byte[] array, int start) {
+ reset(k);
+ super.setByRead(array, start);
+ }
+
+ /**
+ * Resize this kmer to length k, then pack the read text in reversed order, e.g.
+ * AATAG will compress as [0x000A,0xATAG]
+ *
+ * @param k kmer length to switch to
+ * @param array read text, one symbol per byte
+ * @param start position passed through to the fixed-size implementation, which
+ * begins reading at start + k - 1 and walks backwards
+ */
+ public void setByReadReverse(int k, byte[] array, int start) {
+ reset(k);
+ super.setByReadReverse(array, start);
+ }
+ // Copies both the kmer length and the packed bytes of newData into this kmer.
+ public void set(KmerBytesWritable newData) {
+ set(newData.kmerlength, newData.bytes, 0, newData.size);
+ }
+ // Resizes to length k, then copies the resulting size bytes from newData at offset; length is not consulted.
+ public void set(int k, byte[] newData, int offset, int length) {
+ reset(k);
+ System.arraycopy(newData, offset, bytes, 0, size);
+ }
+
+ /**
+ * Switch to kmer length k and resize the byte array to hold it
+ * @param k new kmer length; size first drops to 0 so a grow does not copy the old bytes
+ */
+ public void reset(int k) {
+ this.kmerlength = k;
+ setSize( 0);
+ setSize( KmerUtil.getByteNumFromK(k));
+ }
+ // Reads the int kmer length written by write(), resizes, then reads exactly size packed bytes.
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ reset(in.readInt());
+ in.readFully(bytes, 0, size);
+ }
+
+ /**
+ * Write the kmer length as an int, followed by the packed kmer bytes.
+ * The byte size is not written because it is derived from the kmer length.
+ */
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(this.kmerlength);
+ out.write(bytes, 0, size);
+ }
+ // Raw-byte comparator for serialized VKmers: orders by kmer length first, then by packed bytes.
+ public static class Comparator extends WritableComparator {
+ public final int LEAD_BYTES = 4; // width of the int kmerlength header emitted by write()
+ public Comparator() {
+ super(VKmerBytesWritable.class);
+ }
+
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ int kmerlength1 = readInt(b1,s1);
+ int kmerlength2 = readInt(b2,s2);
+ if (kmerlength1 == kmerlength2){ // same k: order by the packed kmer bytes after the header
+ return compareBytes(b1, s1 + LEAD_BYTES, l1-LEAD_BYTES, b2, s2+LEAD_BYTES, l2-LEAD_BYTES);
+ }
+ return kmerlength1 - kmerlength2 ; // different k: shorter kmer sorts first
+ }
+ }
+
+ static { // register this comparator for the variable-length type whose header it parses
+ WritableComparator.define(VKmerBytesWritable.class, new Comparator());
+ }
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
similarity index 86%
rename from genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java
rename to genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
index d29f6bb..a7bcc8b 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
@@ -1,10 +1,10 @@
package edu.uci.ics.genomix.type;
-public class KmerBytesWritableFactory {
- private KmerBytesWritable kmer;
+public class VKmerBytesWritableFactory {
+ private VKmerBytesWritable kmer;
- public KmerBytesWritableFactory(int k){
- kmer = new KmerBytesWritable(k);
+ public VKmerBytesWritableFactory(int k){
+ kmer = new VKmerBytesWritable(k);
}
/**
@@ -15,7 +15,7 @@
* @param array
* @param start
*/
- public KmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
+ public VKmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
kmer.setByRead(k, array, start);
return kmer;
}
@@ -27,7 +27,7 @@
* @param array
* @param start
*/
- public KmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
+ public VKmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
kmer.setByReadReverse(k, array, start);
return kmer;
}
@@ -41,7 +41,7 @@
* @param kmerChain
* @return LastKmer bytes array
*/
- public KmerBytesWritable getLastKmerFromChain(int lastK, final KmerBytesWritable kmerChain) {
+ public VKmerBytesWritable getLastKmerFromChain(int lastK, final KmerBytesWritable kmerChain) {
if (lastK > kmerChain.getKmerLength()) {
return null;
}
@@ -76,7 +76,7 @@
* @param kmerChain
* @return FirstKmer bytes array
*/
- public KmerBytesWritable getFirstKmerFromChain(int firstK, final KmerBytesWritable kmerChain) {
+ public VKmerBytesWritable getFirstKmerFromChain(int firstK, final KmerBytesWritable kmerChain) {
if (firstK > kmerChain.getKmerLength()) {
return null;
}
@@ -108,7 +108,7 @@
* @param nextCode: next neighbor in gene-code format
* @return the merged Kmer, this K of this Kmer is k+1
*/
- public KmerBytesWritable mergeKmerWithNextCode(final KmerBytesWritable kmer, byte nextCode) {
+ public VKmerBytesWritable mergeKmerWithNextCode(final KmerBytesWritable kmer, byte nextCode) {
this.kmer.reset(kmer.getKmerLength()+1);
for (int i = 1; i <= kmer.getLength(); i++) {
this.kmer.getBytes()[this.kmer.getLength() - i] = kmer.getBytes()[kmer.getLength() - i];
@@ -130,7 +130,7 @@
* @param preCode: next neighbor in gene-code format
* @return the merged Kmer,this K of this Kmer is k+1
*/
- public KmerBytesWritable mergeKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+ public VKmerBytesWritable mergeKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
this.kmer.reset(kmer.getKmerLength()+1);
int byteInMergedKmer = 0;
if (kmer.getKmerLength() % 4 == 0) {
@@ -153,7 +153,7 @@
* @param kmerNext : bytes array of next kmer
* @return merged kmer, the new k is @preK + @nextK
*/
- public KmerBytesWritable mergeTwoKmer(final KmerBytesWritable preKmer, final KmerBytesWritable nextKmer) {
+ public VKmerBytesWritable mergeTwoKmer(final KmerBytesWritable preKmer, final KmerBytesWritable nextKmer) {
kmer.reset(preKmer.getKmerLength() + nextKmer.getKmerLength());
int i = 1;
for (; i <= preKmer.getLength(); i++) {
@@ -189,7 +189,7 @@
* @param afterCode: input genecode
* @return new created kmer that shifted by afterCode, the K will not change
*/
- public KmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode){
+ public VKmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode){
this.kmer.set(kmer);
this.kmer.shiftKmerWithNextCode(afterCode);
return this.kmer;
@@ -203,7 +203,7 @@
* @param preCode: input genecode
* @return new created kmer that shifted by preCode, the K will not change
*/
- public KmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode){
+ public VKmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode){
this.kmer.set(kmer);
this.kmer.shiftKmerWithPreCode(preCode);
return this.kmer;
@@ -213,7 +213,7 @@
* get the reverse sequence of given kmer
* @param kmer
*/
- public KmerBytesWritable reverse(final KmerBytesWritable kmer) {
+ public VKmerBytesWritable reverse(final KmerBytesWritable kmer) {
this.kmer.reset(kmer.getKmerLength());
int curPosAtKmer = ((kmer.getKmerLength() - 1) % 4) << 1;
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
similarity index 85%
rename from genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java
rename to genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
index cedb39a..ea1d0c2 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
@@ -7,24 +7,24 @@
import edu.uci.ics.genomix.type.GeneCode;
import edu.uci.ics.genomix.type.KmerBytesWritable;
-public class KmerTest {
+public class KmerBytesWritableTest {
static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
static int k = 7;
@Test
public void TestCompressKmer() {
KmerBytesWritable kmer = new KmerBytesWritable(k);
- kmer.setByRead(k, array, 0);
+ kmer.setByRead( array, 0);
Assert.assertEquals(kmer.toString(), "AATAGAA");
- kmer.setByRead(k, array, 1);
+ kmer.setByRead( array, 1);
Assert.assertEquals(kmer.toString(), "ATAGAAG");
}
@Test
public void TestMoveKmer(){
KmerBytesWritable kmer = new KmerBytesWritable(k);
- kmer.setByRead(k, array, 0);
+ kmer.setByRead( array, 0);
Assert.assertEquals(kmer.toString(), "AATAGAA");
for (int i = k; i < array.length-1; i++) {
@@ -41,17 +41,17 @@
@Test
public void TestCompressKmerReverse() {
KmerBytesWritable kmer = new KmerBytesWritable(k);
- kmer.setByRead(k, array, 0);
+ kmer.setByRead( array, 0);
Assert.assertEquals(kmer.toString(), "AATAGAA");
- kmer.setByReadReverse(k, array, 1);
+ kmer.setByReadReverse( array, 1);
Assert.assertEquals(kmer.toString(), "GAAGATA");
}
@Test
public void TestMoveKmerReverse(){
KmerBytesWritable kmer = new KmerBytesWritable(k);
- kmer.setByRead(k, array, 0);
+ kmer.setByRead( array, 0);
Assert.assertEquals(kmer.toString(), "AATAGAA");
for (int i = k; i < array.length-1; i++) {
@@ -66,10 +66,10 @@
@Test
public void TestGetGene(){
- KmerBytesWritable kmer = new KmerBytesWritable(k);
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
String text = "AGCTGACCG";
byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C','G' };
- kmer.setByRead(9, array, 0);
+ kmer.setByRead( array, 0);
for(int i =0; i < 9; i++){
Assert.assertEquals(text.charAt(i),
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
similarity index 81%
rename from genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java
rename to genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
index 6959789..a0b8845 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
@@ -5,12 +5,13 @@
import edu.uci.ics.genomix.type.GeneCode;
import edu.uci.ics.genomix.type.KmerBytesWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritableFactory;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
-public class KmerUtilTest {
+public class VKmerBytesWritableFactoryTest {
static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C','G','T'};
- KmerBytesWritableFactory kmerFactory = new KmerBytesWritableFactory(8);
+ VKmerBytesWritableFactory kmerFactory = new VKmerBytesWritableFactory(8);
@Test
public void TestDegree(){
@@ -24,19 +25,24 @@
@Test
public void TestGetLastKmer(){
KmerBytesWritable kmer = new KmerBytesWritable(9);
- kmer.setByRead(9, array, 0);
+ kmer.setByRead( array, 0);
Assert.assertEquals("AGCTGACCG", kmer.toString());
KmerBytesWritable lastKmer ;
for(int i = 8; i>0 ; i--){
lastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
Assert.assertEquals("AGCTGACCG".substring(9-i), lastKmer.toString());
}
+ VKmerBytesWritable vlastKmer ;
+ for(int i = 8; i>0 ; i--){
+ vlastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9-i), vlastKmer.toString());
+ }
}
@Test
public void TestMergeNext(){
KmerBytesWritable kmer = new KmerBytesWritable(9);
- kmer.setByRead(9, array, 0);
+ kmer.setByRead(array, 0);
Assert.assertEquals("AGCTGACCG", kmer.toString());
String text = "AGCTGACCG";
@@ -57,7 +63,7 @@
@Test
public void TestMergePre(){
KmerBytesWritable kmer = new KmerBytesWritable(9);
- kmer.setByRead(9, array, 0);
+ kmer.setByRead(array, 0);
Assert.assertEquals("AGCTGACCG", kmer.toString());
String text = "AGCTGACCG";
for(byte x = GeneCode.A; x<= GeneCode.T ; x++){
@@ -77,10 +83,10 @@
@Test
public void TestMergeTwoKmer(){
KmerBytesWritable kmer1 = new KmerBytesWritable(9);
- kmer1.setByRead(9, array, 0);
+ kmer1.setByRead( array, 0);
String text1 = "AGCTGACCG";
KmerBytesWritable kmer2 = new KmerBytesWritable(9);
- kmer2.setByRead(9, array, 1);
+ kmer2.setByRead(array, 1);
String text2 = "GCTGACCGT";
Assert.assertEquals(text1, kmer1.toString());
Assert.assertEquals(text2, kmer2.toString());
@@ -89,7 +95,7 @@
Assert.assertEquals(text1+text2, merged.toString());
KmerBytesWritable kmer3 = new KmerBytesWritable(3);
- kmer3.setByRead(3, array, 1);
+ kmer3.setByRead(array, 1);
String text3 = "GCT";
Assert.assertEquals(text3, kmer3.toString());
@@ -99,16 +105,16 @@
Assert.assertEquals(text3+text1, merged.toString());
KmerBytesWritable kmer4 = new KmerBytesWritable(8);
- kmer4.setByRead(8, array, 0);
+ kmer4.setByRead( array, 0);
String text4 = "AGCTGACC";
Assert.assertEquals(text4, kmer4.toString());
merged = kmerFactory.mergeTwoKmer(kmer4, kmer3);
Assert.assertEquals(text4+text3, merged.toString());
KmerBytesWritable kmer5 = new KmerBytesWritable(7);
- kmer5.setByRead(7, array, 0);
+ kmer5.setByRead( array, 0);
String text5 = "AGCTGAC";
- KmerBytesWritable kmer6 = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer6 = new VKmerBytesWritable(9);
kmer6.setByRead(9, array, 1);
String text6 = "GCTGACCGT";
merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
@@ -127,14 +133,14 @@
}
@Test
public void TestShift(){
- KmerBytesWritable kmer = new KmerBytesWritable(kmerFactory.getKmerByRead(9, array, 0));
+ VKmerBytesWritable kmer = new VKmerBytesWritable(kmerFactory.getKmerByRead(9, array, 0));
String text = "AGCTGACCG";
Assert.assertEquals(text, kmer.toString());
- KmerBytesWritable kmerForward = kmerFactory.shiftKmerWithNextCode(kmer,GeneCode.A);
+ VKmerBytesWritable kmerForward = kmerFactory.shiftKmerWithNextCode(kmer,GeneCode.A);
Assert.assertEquals(text, kmer.toString());
Assert.assertEquals("GCTGACCGA", kmerForward.toString());
- KmerBytesWritable kmerBackward = kmerFactory.shiftKmerWithPreCode(kmer,GeneCode.C);
+ VKmerBytesWritable kmerBackward = kmerFactory.shiftKmerWithPreCode(kmer,GeneCode.C);
Assert.assertEquals(text, kmer.toString());
Assert.assertEquals("CAGCTGACC", kmerBackward.toString());
@@ -144,7 +150,7 @@
@Test
public void TestReverseKmer(){
KmerBytesWritable kmer = new KmerBytesWritable(7);
- kmer.setByRead(7, array, 0);
+ kmer.setByRead( array, 0);
Assert.assertEquals(kmer.toString(), "AGCTGAC");
KmerBytesWritable reversed = kmerFactory.reverse(kmer);
Assert.assertEquals(reversed.toString(), "CAGTCGA");