Merge branch 'fullstack_genomix' of https://code.google.com/p/hyracks into fullstack_genomix
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index 5d1cc11..8b9f9bd 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -264,12 +264,19 @@
     }
 
     public static class Comparator extends WritableComparator {
+        public final int LEAD_BYTES = 4;
+
         public Comparator() {
             super(KmerBytesWritable.class);
         }
 
         public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
-            return compareBytes(b1, s1, l1, b2, s2, l2);
+            int kmerlength1 = readInt(b1, s1);
+            int kmerlength2 = readInt(b2, s2);
+            if (kmerlength1 == kmerlength2) {
+                return compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES);
+            }
+            return kmerlength1 - kmerlength2;
         }
     }
 
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
index 9dc1dde..59cc3b0 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
@@ -1,6 +1,7 @@
 package edu.uci.ics.genomix.type;
 
 public class KmerUtil {
+    public static final String empty = "";
 
     public static int getByteNumFromK(int k) {
         int x = k / 4;
@@ -18,6 +19,9 @@
     public static String recoverKmerFrom(int k, byte[] keyData, int keyStart, int keyLength) {
         StringBuilder strKmer = new StringBuilder();
         int byteId = keyStart + keyLength - 1;
+        if (byteId < 0){
+            return empty;
+        }
         byte currentbyte = keyData[byteId];
         for (int geneCount = 0; geneCount < k; geneCount++) {
             if (geneCount % 4 == 0 && geneCount > 0) {
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
index 493191a..a66d408 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
@@ -1,7 +1,5 @@
 package edu.uci.ics.genomix.type;
 
-import org.apache.hadoop.io.WritableComparator;
-
 public class VKmerBytesWritable extends KmerBytesWritable {
     /**
      * 
@@ -96,24 +94,4 @@
         setSize(KmerUtil.getByteNumFromK(k));
     }
 
-    public static class Comparator extends WritableComparator {
-        public final int LEAD_BYTES = 4;
-
-        public Comparator() {
-            super(KmerBytesWritable.class);
-        }
-
-        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
-            int kmerlength1 = readInt(b1, s1);
-            int kmerlength2 = readInt(b2, s2);
-            if (kmerlength1 == kmerlength2) {
-                compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES);
-            }
-            return kmerlength1 - kmerlength2;
-        }
-    }
-
-    static { // register this comparator
-        WritableComparator.define(KmerBytesWritable.class, new Comparator());
-    }
 }
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
index dfc0ee3..ab1e633 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
@@ -100,6 +100,35 @@
         return kmer;
     }
 
+    public VKmerBytesWritable getSubKmerFromChain(int startK, int kSize, final KmerBytesWritable kmerChain) {
+        if (startK + kSize > kmerChain.getKmerLength()) {
+            return null;
+        }
+        if (startK == 0 && kSize == kmerChain.getKmerLength()) {
+            kmer.set(kmerChain);
+            return kmer;
+        }
+        kmer.reset(kSize);
+
+        /** from end to start */
+        int byteInChain = kmerChain.getLength() - 1 - startK / 4;
+        int posInByteOfChain = startK % 4 << 1; // *2
+        int byteInKmer = kmer.getLength() - 1;
+        for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+            kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
+            kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+        }
+
+        /** last kmer byte */
+        if (byteInKmer == 0) {
+            kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+        }
+        if (kSize % 4 != 0) {
+            kmer.getBytes()[0] &= (1 << ((kSize % 4) << 1)) - 1;
+        }
+        return kmer;
+    }
+
     /**
      * Merge kmer with next neighbor in gene-code format.
      * The k of new kmer will increase by 1
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
index c40729c..6611752 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
@@ -31,11 +31,50 @@
         for (int i = 8; i > 0; i--) {
             lastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
             Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
+            lastKmer = kmerFactory.getSubKmerFromChain(9-i, i, kmer);
+            Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
         }
         VKmerBytesWritable vlastKmer;
         for (int i = 8; i > 0; i--) {
             vlastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
             Assert.assertEquals("AGCTGACCG".substring(9 - i), vlastKmer.toString());
+            vlastKmer = kmerFactory.getSubKmerFromChain(9-i, i, kmer);
+            Assert.assertEquals("AGCTGACCG".substring(9 - i), vlastKmer.toString());
+        }
+    }
+    
+    @Test
+    public void TestGetFirstKmer(){
+        KmerBytesWritable kmer = new KmerBytesWritable(9);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AGCTGACCG", kmer.toString());
+        KmerBytesWritable firstKmer;
+        for (int i = 8; i > 0; i--) {
+            firstKmer = kmerFactory.getFirstKmerFromChain(i, kmer);
+            Assert.assertEquals("AGCTGACCG".substring(0,i), firstKmer.toString());
+            firstKmer = kmerFactory.getSubKmerFromChain(0,i,kmer);
+            Assert.assertEquals("AGCTGACCG".substring(0,i), firstKmer.toString());   
+        }
+        VKmerBytesWritable vfirstKmer;
+        for (int i = 8; i > 0; i--) {
+            vfirstKmer = kmerFactory.getFirstKmerFromChain(i, kmer);
+            Assert.assertEquals("AGCTGACCG".substring(0,i), vfirstKmer.toString());
+            vfirstKmer = kmerFactory.getSubKmerFromChain(0, i, kmer);
+            Assert.assertEquals("AGCTGACCG".substring(0,i), vfirstKmer.toString());   
+        }
+    }
+    
+    @Test 
+    public void TestGetSubKmer(){
+        KmerBytesWritable kmer = new KmerBytesWritable(9);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AGCTGACCG", kmer.toString());
+        VKmerBytesWritable subKmer;
+        for (int istart = 0; istart < kmer.getKmerLength()-1; istart++) {
+            for(int isize = 1; isize + istart <= kmer.getKmerLength(); isize ++){
+                subKmer = kmerFactory.getSubKmerFromChain(istart, isize, kmer);
+                Assert.assertEquals("AGCTGACCG".substring(istart, istart+isize), subKmer.toString());  
+            }
         }
     }