add KmerUtil for merge or join operation
git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@3306 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
new file mode 100644
index 0000000..83f7f32
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
@@ -0,0 +1,154 @@
+package edu.uci.ics.genomix.type;
+
+public class KmerUtil {
+
+ public static int countNumberOfBitSet(int i) {
+ int c = 0;
+ for (; i != 0; c++) {
+ i &= i - 1;
+ }
+ return c;
+ }
+
+ public static int inDegree(byte bitmap) {
+ return countNumberOfBitSet((bitmap >> 4) & 0x0f);
+ }
+
+ public static int outDegree(byte bitmap) {
+ return countNumberOfBitSet(bitmap & 0x0f);
+ }
+
+ /**
+ * Get last kmer from kmer-chain. e.g. kmerChain is AAGCTA, if k =5, it will
+ * return AGCTA
+ *
+ * @param k
+ * @param kInChain
+ * @param kmerChain
+ * @return
+ */
+ public static byte[] getLastKmerFromChain(int k, int kInChain,
+ byte[] kmerChain) {
+ if (k > kInChain) {
+ return null;
+ }
+ if (k == kInChain) {
+ return kmerChain.clone();
+ }
+ int byteNum = Kmer.getByteNumFromK(k);
+ byte[] kmer = new byte[byteNum];
+
+ /** from end to start */
+ int byteInChain = kmerChain.length - 1 - (kInChain - k) / 4;
+ int posInByteOfChain = ((kInChain - k) % 4) << 1; // *2
+ int byteInKmer = byteNum - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+ kmer[byteInKmer] = (byte) ((0xff & kmerChain[byteInChain]) >> posInByteOfChain);
+ kmer[byteInKmer] |= ((kmerChain[byteInChain - 1] << (8 - posInByteOfChain)));
+ }
+
+ /** last kmer byte */
+ if (byteInKmer == 0) {
+ kmer[0] = (byte) ((kmerChain[0] & 0xff) >> posInByteOfChain);
+ }
+ return kmer;
+ }
+
+ /**
+ * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if k=5, it will
+ * return AAGCT
+ *
+ * @param k
+ * @param kInChain
+ * @param kmerChain
+ * @return
+ */
+ public static byte[] getFirstKmerFromChain(int k, int kInChain,
+ byte[] kmerChain) {
+ if (k > kInChain) {
+ return null;
+ }
+ if (k == kInChain) {
+ return kmerChain.clone();
+ }
+ int byteNum = Kmer.getByteNumFromK(k);
+ byte[] kmer = new byte[byteNum];
+
+ int i = 1;
+ for (; i < kmer.length; i++) {
+ kmer[kmer.length - i] = kmerChain[kmerChain.length - i];
+ }
+ int posInByteOfChain = (k % 4) << 1; // *2
+ if (posInByteOfChain == 0) {
+ kmer[0] = kmerChain[kmerChain.length - i];
+ } else {
+ kmer[0] = (byte) (kmerChain[kmerChain.length - i] & ((1 << posInByteOfChain) - 1));
+ }
+ return kmer;
+ }
+
+ public static byte[] mergeKmerWithNextCode(int k, byte[] kmer, byte nextCode) {
+ int byteNum = kmer.length;
+ if (k % 4 == 0) {
+ byteNum++;
+ }
+ byte[] mergedKmer = new byte[byteNum];
+ for (int i = 1; i <= kmer.length; i++) {
+ mergedKmer[mergedKmer.length - i] = kmer[kmer.length - i];
+ }
+ if (mergedKmer.length > kmer.length) {
+ mergedKmer[0] = (byte) (nextCode & 0x3);
+ } else {
+ mergedKmer[0] = (byte) (kmer[0] | ((nextCode & 0x3) << ((k % 4) << 1)));
+ }
+ return mergedKmer;
+ }
+
+ public static byte[] mergeKmerWithPreCode(int k, byte[] kmer, byte preCode) {
+ int byteNum = kmer.length;
+ byte[] mergedKmer = null;
+ int byteInMergedKmer = 0;
+ if (k % 4 == 0) {
+ byteNum++;
+ mergedKmer = new byte[byteNum];
+ mergedKmer[0] = (byte) ((kmer[0] >> 6) & 0x3);
+ byteInMergedKmer++;
+ } else {
+ mergedKmer = new byte[byteNum];
+ }
+ for (int i = 0; i < kmer.length - 1; i++, byteInMergedKmer++) {
+ mergedKmer[byteInMergedKmer] = (byte) ((kmer[i] << 2) | ((kmer[i + 1] >> 6) & 0x3));
+ }
+ mergedKmer[byteInMergedKmer] = (byte) ((kmer[kmer.length - 1] << 2) | (preCode & 0x3));
+ return mergedKmer;
+ }
+
+ public static byte[] mergeTwoKmer(int preK, byte[] kmerPre, int nextK,
+ byte[] kmerNext) {
+ int byteNum = Kmer.getByteNumFromK(preK + nextK);
+ byte[] mergedKmer = new byte[byteNum];
+ int i = 1;
+ for (; i <= kmerPre.length; i++) {
+ mergedKmer[byteNum - i] = kmerPre[kmerPre.length - i];
+ }
+ i--;
+ if (preK % 4 == 0) {
+ for (int j = 1; j <= kmerNext.length; j++) {
+ mergedKmer[byteNum - i - j] = kmerNext[kmerNext.length - j];
+ }
+ } else {
+ int posNeedToMove = ((preK % 4) << 1);
+ mergedKmer[byteNum - i] |= kmerNext[kmerNext.length - 1] << posNeedToMove;
+ for (int j = 1; j < kmerNext.length; j++) {
+ mergedKmer[byteNum - i - j] = (byte) (((kmerNext[kmerNext.length
+ - j] & 0xff) >> (8 - posNeedToMove)) | (kmerNext[kmerNext.length
+ - j - 1] << posNeedToMove));
+ }
+ if ( (nextK % 4) * 2 + posNeedToMove > 8) {
+ mergedKmer[0] = (byte) (kmerNext[0] >> (8 - posNeedToMove));
+ }
+ }
+ return mergedKmer;
+ }
+
+}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java
new file mode 100644
index 0000000..58e4b22
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java
@@ -0,0 +1,111 @@
+package edu.uci.ics.genomix.example.kmer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.Kmer;
+import edu.uci.ics.genomix.type.KmerUtil;
+
+public class KmerUtilTest {
+ static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C','G','T'};
+
+ @Test
+ public void TestDegree(){
+ Assert.assertTrue(KmerUtil.inDegree((byte) 0xff) == 4);
+ Assert.assertTrue(KmerUtil.outDegree((byte) 0xff) == 4);
+ Assert.assertTrue(KmerUtil.inDegree((byte) 0x3f) == 2);
+ Assert.assertTrue(KmerUtil.outDegree((byte) 0x01) == 1);
+ Assert.assertTrue(KmerUtil.inDegree((byte) 0x01) == 0);
+ }
+
+ @Test
+ public void TestGetLastKmer(){
+ byte[] kmerChain = Kmer.compressKmer(9, array, 0);
+ Assert.assertEquals("AGCTGACCG", Kmer.recoverKmerFrom(9, kmerChain, 0, kmerChain.length));
+ byte[] lastKmer ;
+ for(int i = 8; i>0 ; i--){
+ lastKmer = KmerUtil.getLastKmerFromChain(i, 9, kmerChain);
+// System.out.println(Kmer.recoverKmerFrom(i, lastKmer, 0, lastKmer.length));
+ Assert.assertEquals("AGCTGACCG".substring(9-i), Kmer.recoverKmerFrom(i, lastKmer, 0, lastKmer.length));
+ }
+ }
+
+ @Test
+ public void TestMergeNext(){
+ byte[] kmer = Kmer.compressKmer(9, array, 0);
+ String text = "AGCTGACCG";
+ Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
+ for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
+ kmer = KmerUtil.mergeKmerWithNextCode(9+x, kmer, x);
+// System.out.println(Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
+ text = text + (char)Kmer.GENE_SYMBOL[x];
+ Assert.assertEquals(text, Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
+ }
+ for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
+ kmer = KmerUtil.mergeKmerWithNextCode(13+x, kmer, x);
+// System.out.println(Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
+ text = text + (char)Kmer.GENE_SYMBOL[x];
+ Assert.assertEquals(text, Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
+ }
+ }
+
+ @Test
+ public void TestMergePre(){
+ byte[] kmer = Kmer.compressKmer(9, array, 0);
+ String text = "AGCTGACCG";
+ Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
+ for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
+ kmer = KmerUtil.mergeKmerWithPreCode(9+x, kmer, x);
+// System.out.println(Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
+ text = (char)Kmer.GENE_SYMBOL[x] + text;
+ Assert.assertEquals(text , Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
+ }
+ for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
+ kmer = KmerUtil.mergeKmerWithPreCode(13+x, kmer, x);
+// System.out.println(Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
+ text = (char)Kmer.GENE_SYMBOL[x] + text;
+ Assert.assertEquals(text , Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
+ }
+ }
+
+ @Test
+ public void TestMergeTwoKmer(){
+ byte[] kmer1 = Kmer.compressKmer(9, array, 0);
+ String text1 = "AGCTGACCG";
+ byte[] kmer2 = Kmer.compressKmer(9, array, 1);
+ String text2 = "GCTGACCGT";
+ Assert.assertEquals(text1, Kmer.recoverKmerFrom(9, kmer1, 0, kmer1.length));
+ Assert.assertEquals(text2, Kmer.recoverKmerFrom(9, kmer2, 0, kmer2.length));
+
+ byte[] merged = KmerUtil.mergeTwoKmer(9, kmer1, 9, kmer2);
+ Assert.assertEquals(text1+text2, Kmer.recoverKmerFrom(9+9, merged, 0, merged.length));
+
+ byte[] kmer3 = Kmer.compressKmer(3, array, 1);
+ String text3 = "GCT";
+ Assert.assertEquals(text3, Kmer.recoverKmerFrom(3, kmer3, 0, kmer3.length));
+ merged = KmerUtil.mergeTwoKmer(9, kmer1, 3, kmer3);
+ Assert.assertEquals(text1+text3, Kmer.recoverKmerFrom(9+3, merged, 0, merged.length));
+ merged = KmerUtil.mergeTwoKmer(3, kmer3, 9, kmer1);
+ Assert.assertEquals(text3+text1, Kmer.recoverKmerFrom(9+3, merged, 0, merged.length));
+
+ byte[] kmer4 = Kmer.compressKmer(8, array, 0);
+ String text4 = "AGCTGACC";
+ Assert.assertEquals(text4, Kmer.recoverKmerFrom(8, kmer4, 0, kmer4.length));
+ merged = KmerUtil.mergeTwoKmer(8, kmer4, 3, kmer3);
+ Assert.assertEquals(text4+text3, Kmer.recoverKmerFrom(8+3, merged, 0, merged.length));
+
+ byte[] kmer5 = Kmer.compressKmer(7, array, 0);
+ String text5 = "AGCTGAC";
+ byte[] kmer6 = Kmer.compressKmer(9, array, 1);
+ String text6 = "GCTGACCGT";
+ merged = KmerUtil.mergeTwoKmer(7, kmer5, 9, kmer6);
+ Assert.assertEquals(text5+text6, Kmer.recoverKmerFrom(7+9, merged, 0, merged.length));
+
+ byte[] kmer7 = Kmer.compressKmer(6, array, 1);
+ String text7 = "GCTGAC";
+ merged = KmerUtil.mergeTwoKmer(7, kmer5, 6, kmer7);
+ Assert.assertEquals(text5+text7, Kmer.recoverKmerFrom(7+6, merged, 0, merged.length));
+
+ }
+
+}