Add mergeNextKmer to KmerBytesWritable and use it from NodeWritable.mergeNext
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index e548ded..f237afb 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -309,7 +309,7 @@
* : next neighbor in gene-code format
* @return the merged Kmer, this K of this Kmer is k+1
*/
- public void mergeKmerWithNextCode(byte nextCode) {
+ public void mergeNextCode(byte nextCode) {
this.kmerlength += 1;
setSize(KmerUtil.getByteNumFromK(kmerlength));
if (kmerlength % 4 == 1) {
@@ -318,7 +318,48 @@
}
bytes[offset] = (byte) (nextCode & 0x3);
} else {
- bytes[offset] = (byte) (bytes[offset] | ((nextCode & 0x3) << (((kmerlength-1) % 4) << 1)));
+ bytes[offset] = (byte) (bytes[offset] | ((nextCode & 0x3) << (((kmerlength - 1) % 4) << 1)));
+ }
+ clearLeadBit();
+ }
+
+ /**
+ * Merge Kmer with the next connected Kmer, keeping the shared bases only once.
+ * Connected Kmers overlap by {@code initialKmerSize - 1} bases, so only the bases
+ * of {@code kmer} from position {@code initialKmerSize - 1} onwards are appended,
+ * one gene code at a time, through {@link #mergeNextCode(byte)}.
+ * e.g. AAGCTAA merge with AACAACC, if the initial kmerSize = 3
+ * then it will return AAGCTAACAACC
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize, must be at least 1
+ * @param kmer
+ * : the Kmer that follows this one
+ * @throws IllegalArgumentException
+ * if {@code initialKmerSize} is smaller than 1
+ */
+ public void mergeNextKmer(int initialKmerSize, KmerBytesWritable kmer) {
+ if (initialKmerSize < 1) {
+ throw new IllegalArgumentException("initialKmerSize must be >= 1, got " + initialKmerSize);
+ }
+ // Read the length once so the loop bound stays fixed while this Kmer grows,
+ // even if the caller passes this very object as the next Kmer.
+ final int nextLength = kmer.getKmerLength();
+ // Skip the first initialKmerSize - 1 bases: they repeat the tail of this Kmer.
+ for (int pos = initialKmerSize - 1; pos < nextLength; pos++) {
+ mergeNextCode(kmer.getGeneCodeAtPosition(pos));
}
clearLeadBit();
}
@@ -391,4 +432,5 @@
static { // register this comparator
WritableComparator.define(KmerBytesWritable.class, new Comparator());
}
+
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
index 0276c3a..daf4a4f 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
@@ -16,8 +16,8 @@
private PositionListWritable incomingList;
private PositionListWritable outgoingList;
private KmerBytesWritable kmer;
-
- public NodeWritable(){
+
+ public NodeWritable() {
nodeID = new PositionWritable();
incomingList = new PositionListWritable();
outgoingList = new PositionListWritable();
@@ -31,10 +31,6 @@
kmer = new KmerBytesWritable(kmerSize);
}
- public int getCount() {
- return kmer.getKmerLength();
- }
-
public void setNodeID(PositionWritable ref) {
this.setNodeID(ref.getReadID(), ref.getPosInRead());
}
@@ -74,9 +70,13 @@
return kmer;
}
- public void mergeNextWithinOneRead(NodeWritable nextNodeEntry) {
- this.outgoingList.set(nextNodeEntry.outgoingList);
- kmer.mergeKmerWithNextCode(nextNodeEntry.kmer.getGeneCodeAtPosition(nextNodeEntry.getCount() - 1));
+ /**
+ * @return the length of the Kmer held by this node
+ */
+ public int getCount() {
+ final int kmerLength = kmer.getKmerLength();
+ return kmerLength;
+ }
+
+ public void mergeNext(NodeWritable nextNode, int initialKmerSize) {
+ this.outgoingList.set(nextNode.outgoingList);
+ kmer.mergeNextKmer(initialKmerSize, nextNode.getKmer());
}
public void set(NodeWritable node) {
@@ -111,9 +111,9 @@
public int hashCode() {
return nodeID.hashCode();
}
-
+
@Override
- public String toString(){
+ public String toString() {
StringBuilder sbuilder = new StringBuilder();
sbuilder.append('(');
sbuilder.append(nodeID.toString()).append('\t');
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
index b7dd8f1..04fc3bb 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
@@ -59,7 +59,7 @@
Assert.assertEquals(kmer.toString(), "AATAGAA");
kmer.setByReadReverse(array, 1);
- Assert.assertEquals(kmer.toString(), "GAAGATA");
+ Assert.assertEquals(kmer.toString(), "CTTCTAT");
}
@Test
@@ -100,11 +100,35 @@
String text = "AGCTGACCG";
for (int i = 0; i < 10; i++) {
for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
- kmer.mergeKmerWithNextCode(x);
+ kmer.mergeNextCode(x);
text = text + (char) GeneCode.GENE_SYMBOL[x];
Assert.assertEquals(text, kmer.toString());
}
}
}
+
+ /**
+ * Checks mergeNextKmer for a merge that overlaps by initialKmerSize - 1 bases
+ * and for a follow-up merge on the same object with no overlap at all.
+ */
+ @Test
+ public void TestMergeNextKmer() {
+ byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+ KmerBytesWritable kmer1 = new KmerBytesWritable(9);
+ kmer1.setByRead(array, 0);
+ String text1 = "AGCTGACCG";
+ KmerBytesWritable kmer2 = new KmerBytesWritable(9);
+ kmer2.setByRead(array, 1);
+ String text2 = "GCTGACCGT";
+ Assert.assertEquals(text1, kmer1.toString());
+ Assert.assertEquals(text2, kmer2.toString());
+
+ // kmer1 and kmer2 share 8 bases, so only the trailing 'T' of kmer2 is appended.
+ KmerBytesWritable merged = new KmerBytesWritable(kmer1);
+ merged.mergeNextKmer(9, kmer2);
+ String mergedText = text1 + "T";
+ // Compare the rendered text: a String is never equal to a KmerBytesWritable.
+ Assert.assertEquals(mergedText, merged.toString());
+
+ KmerBytesWritable kmer3 = new KmerBytesWritable(3);
+ kmer3.setByRead(array, 1);
+ String text3 = "GCT";
+ Assert.assertEquals(text3, kmer3.toString());
+ // initialKmerSize 1 means no shared bases; merged already holds mergedText,
+ // so the whole of kmer3 is appended to the 10-base result above.
+ merged.mergeNextKmer(1, kmer3);
+ Assert.assertEquals(mergedText + text3, merged.toString());
+ }
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/MapReadToNodeOperator.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/MapReadToNodeOperator.java
index d81c595..d80ed13 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/MapReadToNodeOperator.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/MapReadToNodeOperator.java
@@ -115,7 +115,7 @@
if (nextNodeEntry.getOutgoingList().getCountOfPosition() == 0) {
if (pNextNext == null || pNextNext.getOutgoingList().getCountOfPosition() == 0) {
- curNodeEntry.mergeNextWithinOneRead(nextNodeEntry);
+ curNodeEntry.mergeNext(nextNodeEntry, kmerSize);
} else {
curNodeEntry.getOutgoingList().reset();
curNodeEntry.getOutgoingList().append(nextNodeEntry.getNodeID());