add KmerUtil for merge or join operation

git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@3306 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
new file mode 100644
index 0000000..83f7f32
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
@@ -0,0 +1,154 @@
+package edu.uci.ics.genomix.type;
+
+public class KmerUtil {
+
+	public static int countNumberOfBitSet(int i) {
+		int c = 0;
+		for (; i != 0; c++) {
+			i &= i - 1;
+		}
+		return c;
+	}
+
+	public static int inDegree(byte bitmap) {
+		return countNumberOfBitSet((bitmap >> 4) & 0x0f);
+	}
+
+	public static int outDegree(byte bitmap) {
+		return countNumberOfBitSet(bitmap & 0x0f);
+	}
+
+	/**
+	 * Get last kmer from kmer-chain. e.g. kmerChain is AAGCTA, if k =5, it will
+	 * return AGCTA
+	 * 
+	 * @param k
+	 * @param kInChain
+	 * @param kmerChain
+	 * @return
+	 */
+	public static byte[] getLastKmerFromChain(int k, int kInChain,
+			byte[] kmerChain) {
+		if (k > kInChain) {
+			return null;
+		}
+		if (k == kInChain) {
+			return kmerChain.clone();
+		}
+		int byteNum = Kmer.getByteNumFromK(k);
+		byte[] kmer = new byte[byteNum];
+
+		/** from end to start */
+		int byteInChain = kmerChain.length - 1 - (kInChain - k) / 4;
+		int posInByteOfChain = ((kInChain - k) % 4) << 1; // *2
+		int byteInKmer = byteNum - 1;
+		for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+			kmer[byteInKmer] = (byte) ((0xff & kmerChain[byteInChain]) >> posInByteOfChain);
+			kmer[byteInKmer] |= ((kmerChain[byteInChain - 1] << (8 - posInByteOfChain)));
+		}
+
+		/** last kmer byte */
+		if (byteInKmer == 0) {
+			kmer[0] = (byte) ((kmerChain[0] & 0xff) >> posInByteOfChain);
+		}
+		return kmer;
+	}
+
+	/**
+	 * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if k=5, it will
+	 * return AAGCT
+	 * 
+	 * @param k
+	 * @param kInChain
+	 * @param kmerChain
+	 * @return
+	 */
+	public static byte[] getFirstKmerFromChain(int k, int kInChain,
+			byte[] kmerChain) {
+		if (k > kInChain) {
+			return null;
+		}
+		if (k == kInChain) {
+			return kmerChain.clone();
+		}
+		int byteNum = Kmer.getByteNumFromK(k);
+		byte[] kmer = new byte[byteNum];
+
+		int i = 1;
+		for (; i < kmer.length; i++) {
+			kmer[kmer.length - i] = kmerChain[kmerChain.length - i];
+		}
+		int posInByteOfChain = (k % 4) << 1; // *2
+		if (posInByteOfChain == 0) {
+			kmer[0] = kmerChain[kmerChain.length - i];
+		} else {
+			kmer[0] = (byte) (kmerChain[kmerChain.length - i] & ((1 << posInByteOfChain) - 1));
+		}
+		return kmer;
+	}
+
+	public static byte[] mergeKmerWithNextCode(int k, byte[] kmer, byte nextCode) {
+		int byteNum = kmer.length;
+		if (k % 4 == 0) {
+			byteNum++;
+		}
+		byte[] mergedKmer = new byte[byteNum];
+		for (int i = 1; i <= kmer.length; i++) {
+			mergedKmer[mergedKmer.length - i] = kmer[kmer.length - i];
+		}
+		if (mergedKmer.length > kmer.length) {
+			mergedKmer[0] = (byte) (nextCode & 0x3);
+		} else {
+			mergedKmer[0] = (byte) (kmer[0] | ((nextCode & 0x3) << ((k % 4) << 1)));
+		}
+		return mergedKmer;
+	}
+
+	public static byte[] mergeKmerWithPreCode(int k, byte[] kmer, byte preCode) {
+		int byteNum = kmer.length;
+		byte[] mergedKmer = null;
+		int byteInMergedKmer = 0;
+		if (k % 4 == 0) {
+			byteNum++;
+			mergedKmer = new byte[byteNum];
+			mergedKmer[0] = (byte) ((kmer[0] >> 6) & 0x3);
+			byteInMergedKmer++;
+		} else {
+			mergedKmer = new byte[byteNum];
+		}
+		for (int i = 0; i < kmer.length - 1; i++, byteInMergedKmer++) {
+			mergedKmer[byteInMergedKmer] = (byte) ((kmer[i] << 2) | ((kmer[i + 1] >> 6) & 0x3));
+		}
+		mergedKmer[byteInMergedKmer] = (byte) ((kmer[kmer.length - 1] << 2) | (preCode & 0x3));
+		return mergedKmer;
+	}
+
+	public static byte[] mergeTwoKmer(int preK, byte[] kmerPre, int nextK,
+			byte[] kmerNext) {
+		int byteNum = Kmer.getByteNumFromK(preK + nextK);
+		byte[] mergedKmer = new byte[byteNum];
+		int i = 1;
+		for (; i <= kmerPre.length; i++) {
+			mergedKmer[byteNum - i] = kmerPre[kmerPre.length - i];
+		}
+		i--;
+		if (preK % 4 == 0) {
+			for (int j = 1; j <= kmerNext.length; j++) {
+				mergedKmer[byteNum - i - j] = kmerNext[kmerNext.length - j];
+			}
+		} else {
+			int posNeedToMove = ((preK % 4) << 1);
+			mergedKmer[byteNum - i] |= kmerNext[kmerNext.length - 1] << posNeedToMove;
+			for (int j = 1; j < kmerNext.length; j++) {
+				mergedKmer[byteNum - i - j] = (byte) (((kmerNext[kmerNext.length
+						- j] & 0xff) >> (8 - posNeedToMove)) | (kmerNext[kmerNext.length
+						- j - 1] << posNeedToMove));
+			}
+			if ( (nextK % 4) * 2 + posNeedToMove > 8) {
+				mergedKmer[0] = (byte) (kmerNext[0] >> (8 - posNeedToMove));
+			}
+		}
+		return mergedKmer;
+	}
+
+}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java
new file mode 100644
index 0000000..58e4b22
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerUtilTest.java
@@ -0,0 +1,111 @@
+package edu.uci.ics.genomix.example.kmer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.Kmer;
+import edu.uci.ics.genomix.type.KmerUtil;
+
+public class KmerUtilTest {
+	static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C','G','T'};
+	
+	@Test
+	public void TestDegree(){
+		Assert.assertTrue(KmerUtil.inDegree((byte) 0xff) == 4); 
+		Assert.assertTrue(KmerUtil.outDegree((byte) 0xff) == 4);
+		Assert.assertTrue(KmerUtil.inDegree((byte) 0x3f) == 2);
+		Assert.assertTrue(KmerUtil.outDegree((byte) 0x01) == 1);
+		Assert.assertTrue(KmerUtil.inDegree((byte) 0x01) == 0);
+	}
+	
+	@Test
+	public void TestGetLastKmer(){
+		byte[] kmerChain = Kmer.compressKmer(9, array, 0);
+		Assert.assertEquals("AGCTGACCG", Kmer.recoverKmerFrom(9, kmerChain, 0, kmerChain.length));
+		byte[] lastKmer ;
+		for(int i = 8; i>0 ; i--){
+			lastKmer = KmerUtil.getLastKmerFromChain(i, 9, kmerChain);
+//			System.out.println(Kmer.recoverKmerFrom(i, lastKmer, 0, lastKmer.length));
+			Assert.assertEquals("AGCTGACCG".substring(9-i), Kmer.recoverKmerFrom(i, lastKmer, 0, lastKmer.length));
+		}
+	}
+	
+	@Test
+	public void TestMergeNext(){
+		byte[] kmer = Kmer.compressKmer(9, array, 0);
+		String text = "AGCTGACCG";
+		Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
+		for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
+			kmer = KmerUtil.mergeKmerWithNextCode(9+x, kmer, x);
+//			System.out.println(Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
+			text = text + (char)Kmer.GENE_SYMBOL[x];
+			Assert.assertEquals(text, Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
+		}
+		for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
+			kmer = KmerUtil.mergeKmerWithNextCode(13+x, kmer, x);
+//			System.out.println(Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
+			text = text + (char)Kmer.GENE_SYMBOL[x];
+			Assert.assertEquals(text, Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
+		}
+	}
+	
+	@Test
+	public void TestMergePre(){
+		byte[] kmer = Kmer.compressKmer(9, array, 0);
+		String text = "AGCTGACCG";
+		Assert.assertEquals(text, Kmer.recoverKmerFrom(9, kmer, 0, kmer.length));
+		for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
+			kmer = KmerUtil.mergeKmerWithPreCode(9+x, kmer, x);
+//			System.out.println(Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
+			text = (char)Kmer.GENE_SYMBOL[x] + text;
+			Assert.assertEquals(text , Kmer.recoverKmerFrom(9+x+1, kmer, 0, kmer.length));
+		}
+		for(byte x = Kmer.GENE_CODE.A; x<= Kmer.GENE_CODE.T ; x++){
+			kmer = KmerUtil.mergeKmerWithPreCode(13+x, kmer, x);
+//			System.out.println(Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
+			text = (char)Kmer.GENE_SYMBOL[x] + text;
+			Assert.assertEquals(text , Kmer.recoverKmerFrom(13+x+1, kmer, 0, kmer.length));
+		}
+	}
+	
+	@Test
+	public void TestMergeTwoKmer(){
+		byte[] kmer1 = Kmer.compressKmer(9, array, 0);
+		String text1 = "AGCTGACCG";
+		byte[] kmer2 = Kmer.compressKmer(9, array, 1);
+		String text2 = "GCTGACCGT";
+		Assert.assertEquals(text1, Kmer.recoverKmerFrom(9, kmer1, 0, kmer1.length));
+		Assert.assertEquals(text2, Kmer.recoverKmerFrom(9, kmer2, 0, kmer2.length));
+		
+		byte[] merged = KmerUtil.mergeTwoKmer(9, kmer1, 9, kmer2);
+		Assert.assertEquals(text1+text2, Kmer.recoverKmerFrom(9+9, merged, 0, merged.length));
+		
+		byte[] kmer3 = Kmer.compressKmer(3, array, 1);
+		String text3 = "GCT";
+		Assert.assertEquals(text3, Kmer.recoverKmerFrom(3, kmer3, 0, kmer3.length));
+		merged = KmerUtil.mergeTwoKmer(9, kmer1, 3, kmer3);
+		Assert.assertEquals(text1+text3, Kmer.recoverKmerFrom(9+3, merged, 0, merged.length));
+		merged = KmerUtil.mergeTwoKmer(3, kmer3, 9, kmer1);
+		Assert.assertEquals(text3+text1, Kmer.recoverKmerFrom(9+3, merged, 0, merged.length));
+		
+		byte[] kmer4 = Kmer.compressKmer(8, array, 0);
+		String text4 = "AGCTGACC";
+		Assert.assertEquals(text4, Kmer.recoverKmerFrom(8, kmer4, 0, kmer4.length));
+		merged = KmerUtil.mergeTwoKmer(8, kmer4, 3, kmer3);
+		Assert.assertEquals(text4+text3, Kmer.recoverKmerFrom(8+3, merged, 0, merged.length));
+		
+		byte[] kmer5 = Kmer.compressKmer(7, array, 0);
+		String text5 = "AGCTGAC";
+		byte[] kmer6 = Kmer.compressKmer(9, array, 1);
+		String text6 = "GCTGACCGT";
+		merged = KmerUtil.mergeTwoKmer(7, kmer5, 9, kmer6);
+		Assert.assertEquals(text5+text6, Kmer.recoverKmerFrom(7+9, merged, 0, merged.length));
+		
+		byte[] kmer7 = Kmer.compressKmer(6, array, 1);
+		String text7 = "GCTGAC";
+		merged = KmerUtil.mergeTwoKmer(7, kmer5, 6, kmer7);
+		Assert.assertEquals(text5+text7, Kmer.recoverKmerFrom(7+6, merged, 0, merged.length));
+
+	}
+	
+}