Merge branch 'fullstack_genomix' of https://code.google.com/p/hyracks into fullstack_genomix
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index 5d1cc11..8b9f9bd 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -264,12 +264,19 @@
}
public static class Comparator extends WritableComparator {
+ /** Number of leading bytes that {@link #compare} reads as an int header before comparing the remaining bytes. */
+ public static final int LEAD_BYTES = 4;
+
public Comparator() {
super(KmerBytesWritable.class);
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- return compareBytes(b1, s1, l1, b2, s2, l2);
+ // Each record starts with an int header (presumably the kmer length -- confirm against the writer).
+ // Records whose headers differ are ordered by that int alone.
+ int kmerlength1 = readInt(b1, s1);
+ int kmerlength2 = readInt(b2, s2);
+ if (kmerlength1 == kmerlength2) {
+ // Same header: order by the bytes that follow it.
+ return compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES);
+ }
+ // Explicit comparison rather than subtraction, which can overflow and flip the sign.
+ return kmerlength1 < kmerlength2 ? -1 : 1;
}
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
index 9dc1dde..59cc3b0 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
@@ -1,6 +1,7 @@
package edu.uci.ics.genomix.type;
public class KmerUtil {
+ public static final String empty = "";
public static int getByteNumFromK(int k) {
int x = k / 4;
@@ -18,6 +19,9 @@
public static String recoverKmerFrom(int k, byte[] keyData, int keyStart, int keyLength) {
StringBuilder strKmer = new StringBuilder();
int byteId = keyStart + keyLength - 1;
+ if (byteId < 0){
+ return empty;
+ }
byte currentbyte = keyData[byteId];
for (int geneCount = 0; geneCount < k; geneCount++) {
if (geneCount % 4 == 0 && geneCount > 0) {
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
index 493191a..a66d408 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
@@ -1,7 +1,5 @@
package edu.uci.ics.genomix.type;
-import org.apache.hadoop.io.WritableComparator;
-
public class VKmerBytesWritable extends KmerBytesWritable {
/**
*
@@ -96,24 +94,4 @@
setSize(KmerUtil.getByteNumFromK(k));
}
- public static class Comparator extends WritableComparator {
- public final int LEAD_BYTES = 4;
-
- public Comparator() {
- super(KmerBytesWritable.class);
- }
-
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- int kmerlength1 = readInt(b1, s1);
- int kmerlength2 = readInt(b2, s2);
- if (kmerlength1 == kmerlength2) {
- compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES);
- }
- return kmerlength1 - kmerlength2;
- }
- }
-
- static { // register this comparator
- WritableComparator.define(KmerBytesWritable.class, new Comparator());
- }
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
index dfc0ee3..ab1e633 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
@@ -100,6 +100,35 @@
return kmer;
}
+ /**
+ * Copies kSize letters of kmerChain, starting at letter index startK, into the
+ * kmer object used by this factory and returns that object.
+ * Letters occupy two bits each, four per byte; letter 0 sits in the low bits of the
+ * last byte, so the copy walks both arrays from the end towards index 0.
+ * The result is written into `kmer`, which is not declared in this method
+ * (presumably a reused field -- callers should copy it to keep a result).
+ *
+ * @param startK index of the first letter to copy
+ * @param kSize number of letters to copy
+ * @param kmerChain kmer to read the letters from
+ * @return the kmer holding the requested letters, or null when startK or kSize is
+ * negative or the range runs past the end of kmerChain
+ */
+ public VKmerBytesWritable getSubKmerFromChain(int startK, int kSize, final KmerBytesWritable kmerChain) {
+ // Negative arguments would otherwise produce negative shift distances or bad indices
+ // and silently return garbage; the subtraction form also cannot overflow.
+ if (startK < 0 || kSize < 0 || kSize > kmerChain.getKmerLength() - startK) {
+ return null;
+ }
+ // Whole chain requested: plain copy.
+ if (startK == 0 && kSize == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(kSize);
+
+ // from end to start
+ int byteInChain = kmerChain.getLength() - 1 - startK / 4;
+ // Bit offset of letter startK inside its chain byte (2 bits per letter).
+ int posInByteOfChain = (startK % 4) << 1; // *2
+ int byteInKmer = kmer.getLength() - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+ // Low part of the output byte comes from the current chain byte shifted down.
+ kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
+ // High part comes from the preceding chain byte shifted up; with offset 0 the
+ // shift is 8 and the narrowing back to byte discards it entirely.
+ kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ }
+
+ // last kmer byte: only chain byte 0 is left, so there is no preceding byte to borrow from
+ if (byteInKmer == 0) {
+ kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ }
+ // Clear the bits above the kSize % 4 letters stored in byte 0.
+ if (kSize % 4 != 0) {
+ kmer.getBytes()[0] &= (1 << ((kSize % 4) << 1)) - 1;
+ }
+ return kmer;
+ }
+
/**
* Merge kmer with next neighbor in gene-code format.
* The k of new kmer will increase by 1
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
index c40729c..6611752 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
@@ -31,11 +31,50 @@
for (int i = 8; i > 0; i--) {
lastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
+ lastKmer = kmerFactory.getSubKmerFromChain(9-i, i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
}
VKmerBytesWritable vlastKmer;
for (int i = 8; i > 0; i--) {
vlastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
Assert.assertEquals("AGCTGACCG".substring(9 - i), vlastKmer.toString());
+ vlastKmer = kmerFactory.getSubKmerFromChain(9-i, i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9 - i), vlastKmer.toString());
+ }
+ }
+
+ /**
+ * Checks that getFirstKmerFromChain(len, chain) and getSubKmerFromChain(0, len, chain)
+ * both render as the first len letters of "AGCTGACCG" for len = 8 down to 1,
+ * holding the results first as KmerBytesWritable and then as VKmerBytesWritable.
+ */
+ @Test
+ public void TestGetFirstKmer(){
+ final String read = "AGCTGACCG";
+ KmerBytesWritable chain = new KmerBytesWritable(9);
+ chain.setByRead(array, 0);
+ Assert.assertEquals(read, chain.toString());
+
+ // Pass 1: results referenced through the base type.
+ for (int len = 8; len >= 1; len--) {
+ KmerBytesWritable head = kmerFactory.getFirstKmerFromChain(len, chain);
+ Assert.assertEquals(read.substring(0, len), head.toString());
+ head = kmerFactory.getSubKmerFromChain(0, len, chain);
+ Assert.assertEquals(read.substring(0, len), head.toString());
+ }
+
+ // Pass 2: the same checks with results referenced as VKmerBytesWritable.
+ for (int len = 8; len >= 1; len--) {
+ VKmerBytesWritable vhead = kmerFactory.getFirstKmerFromChain(len, chain);
+ Assert.assertEquals(read.substring(0, len), vhead.toString());
+ vhead = kmerFactory.getSubKmerFromChain(0, len, chain);
+ Assert.assertEquals(read.substring(0, len), vhead.toString());
+ }
+ }
+
+ /**
+ * Checks getSubKmerFromChain against String.substring on "AGCTGACCG" for every
+ * start index and every size that still fits inside the 9-letter kmer.
+ */
+ @Test
+ public void TestGetSubKmer(){
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ VKmerBytesWritable subKmer;
+ // Include the final start index (length - 1), whose only valid size is 1.
+ for (int istart = 0; istart < kmer.getKmerLength(); istart++) {
+ for(int isize = 1; isize + istart <= kmer.getKmerLength(); isize ++){
+ subKmer = kmerFactory.getSubKmerFromChain(istart, isize, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(istart, istart+isize), subKmer.toString());
+ }
+ }
}