Merge commit '94e075b5c3db9aa613ef61c2581430a143b17bc8' into nanzhang/hyracks_genomix
Conflicts:
genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerListWritableTest.java
genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MessageWritable.java
genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java
genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicGraphCleanVertex.java
genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P1ForPathMergeVertex.java
genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P3ForPathMergeVertex.java
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/oldtype/IntermediateNodeWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/oldtype/IntermediateNodeWritable.java
deleted file mode 100644
index 3826f86..0000000
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/oldtype/IntermediateNodeWritable.java
+++ /dev/null
@@ -1,144 +0,0 @@
-package edu.uci.ics.genomix.oldtype;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.io.Serializable;
-
-import org.apache.hadoop.io.WritableComparable;
-
-import edu.uci.ics.genomix.type.KmerListWritable;
-import edu.uci.ics.genomix.type.PositionWritable;
-
-public class IntermediateNodeWritable implements WritableComparable<IntermediateNodeWritable>, Serializable{
-
- private static final long serialVersionUID = 1L;
- public static final IntermediateNodeWritable EMPTY_NODE = new IntermediateNodeWritable();
-
- private KmerListWritable forwardForwardList;
- private KmerListWritable forwardReverseList;
- private KmerListWritable reverseForwardList;
- private KmerListWritable reverseReverseList;
- private PositionWritable nodeId;
-
- public IntermediateNodeWritable(){
- forwardForwardList = new KmerListWritable();
- forwardReverseList = new KmerListWritable();
- reverseForwardList = new KmerListWritable();
- reverseReverseList = new KmerListWritable();
- nodeId = new PositionWritable();
- }
-
- public IntermediateNodeWritable(KmerListWritable FFList, KmerListWritable FRList,
- KmerListWritable RFList, KmerListWritable RRList, PositionWritable uniqueKey) {
- this();
- set(FFList, FRList, RFList, RRList, uniqueKey);
- }
-
- public void set(IntermediateNodeWritable node){
- set(node.forwardForwardList, node.forwardReverseList, node.reverseForwardList,
- node.reverseReverseList, node.nodeId);
- }
-
- public void set(KmerListWritable FFList, KmerListWritable FRList,
- KmerListWritable RFList, KmerListWritable RRList, PositionWritable uniqueKey) {
- this.forwardForwardList.set(FFList);
- this.forwardReverseList.set(FRList);
- this.reverseForwardList.set(RFList);
- this.reverseReverseList.set(RRList);
- this.nodeId.set(uniqueKey);
- }
-
- public KmerListWritable getFFList() {
- return forwardForwardList;
- }
-
- public void setFFList(KmerListWritable forwardForwardList) {
- this.forwardForwardList.set(forwardForwardList);
- }
-
- public KmerListWritable getFRList() {
- return forwardReverseList;
- }
-
- public void setFRList(KmerListWritable forwardReverseList) {
- this.forwardReverseList.set(forwardReverseList);
- }
-
- public KmerListWritable getRFList() {
- return reverseForwardList;
- }
-
- public void setRFList(KmerListWritable reverseForwardList) {
- this.reverseForwardList.set(reverseForwardList);
- }
-
- public KmerListWritable getRRList() {
- return reverseReverseList;
- }
-
- public void setRRList(KmerListWritable reverseReverseList) {
- this.reverseReverseList.set(reverseReverseList);
- }
-
- public PositionWritable getNodeId() {
- return nodeId;
- }
-
- public void setNodeId(PositionWritable nodeId) {
- this.nodeId.set(nodeId);
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- this.forwardForwardList.readFields(in);
- this.forwardReverseList.readFields(in);
- this.reverseForwardList.readFields(in);
- this.reverseReverseList.readFields(in);
- this.nodeId.readFields(in);
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- this.forwardForwardList.write(out);
- this.forwardReverseList.write(out);
- this.reverseForwardList.write(out);
- this.reverseReverseList.write(out);
- this.nodeId.write(out);
- }
-
- @Override
- public int compareTo(IntermediateNodeWritable other) {
- // TODO Auto-generated method stub
- return this.nodeId.compareTo(other.nodeId);
- }
-
- @Override
- public int hashCode() {
- return this.nodeId.hashCode();
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof IntermediateNodeWritable) {
- IntermediateNodeWritable nw = (IntermediateNodeWritable) o;
- return (this.forwardForwardList.equals(nw.forwardForwardList)
- && this.forwardReverseList.equals(nw.forwardReverseList)
- && this.reverseForwardList.equals(nw.reverseForwardList)
- && this.reverseReverseList.equals(nw.reverseReverseList) && (this.nodeId.equals(nw.nodeId)));
- }
- return false;
- }
-
- @Override
- public String toString() {
- StringBuilder sbuilder = new StringBuilder();
- sbuilder.append('(');
- sbuilder.append(nodeId.toString()).append('\t');
- sbuilder.append(forwardForwardList.toString()).append('\t');
- sbuilder.append(forwardReverseList.toString()).append('\t');
- sbuilder.append(reverseForwardList.toString()).append('\t');
- sbuilder.append(reverseReverseList.toString()).append('\t').append(')');
- return sbuilder.toString();
- }
-}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index 65233d8..e042840 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -25,192 +25,137 @@
import org.apache.hadoop.io.WritableComparator;
import edu.uci.ics.genomix.data.KmerUtil;
-import edu.uci.ics.genomix.oldtype.NodeWritable.DirectionFlag;
+import edu.uci.ics.genomix.data.Marshal;
/**
- * Variable kmer length byteswritable
- * It was used to generate the graph in which phase the kmer length doesn't change.
- * Thus the kmerByteSize of bytes doesn't change either.
+ * Fixed, static-length Kmer used as the key and edge values of each
+ * NodeWritable. Kmer length should be set once during configuration and should
+ * never change.
*/
public class KmerBytesWritable extends BinaryComparable implements Serializable, WritableComparable<BinaryComparable> {
- /**
- *
- */
- private static final long serialVersionUID = 1L;
- private static final byte[] EMPTY_BYTES = {};
- public int kmerByteSize;
+ private static final long serialVersionUID = 1L;
+ protected static final byte[] EMPTY_BYTES = {};
+
+ protected static int lettersInKmer;
+ protected static int bytesUsed;
protected byte[] bytes;
protected int offset;
- protected int kmerlength;
- public KmerBytesWritable() {
- this(0, EMPTY_BYTES, 0);
- }
-
- public KmerBytesWritable(int k, byte[] storage, int offset) {
- setNewReference(k, storage, offset);
- }
-
- public KmerBytesWritable(int k, String kmer) {
- setNewReference(kmer.length(), kmer.getBytes(), 0);
+ /**
+ * set the *GLOBAL* kmer length to the given k value.
+ * NOTE: this will invalidate ALL previously created kmers. This function
+ * should be called before any kmers are created
+ */
+ public static void setGlobalKmerLength(int k) {
+ bytesUsed = KmerUtil.getByteNumFromK(k);
+ lettersInKmer = k;
}
/**
- * Initial Kmer space by kmerlength
- *
- * @param k
- * kmerlength
+ * Initialize as empty kmer
*/
- public KmerBytesWritable(int k) {
- this.kmerlength = k;
- this.kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- if (k > 0) {
- this.bytes = new byte[this.kmerByteSize];
- } else {
- this.bytes = EMPTY_BYTES;
- }
- this.offset = 0;
+ public KmerBytesWritable() {
+ bytes = new byte[bytesUsed];
+ offset = 0;
}
- public KmerBytesWritable(KmerBytesWritable right) {
- this(right.kmerlength);
- set(right);
+ /**
+ * Copy contents of kmer string
+ */
+ public KmerBytesWritable(String kmer) {
+ this();
+ setByRead(kmer.getBytes(), 0);
+ }
+
+ /**
+ * Set as reference to existing data
+ */
+ public KmerBytesWritable(byte[] storage, int offset) {
+ setAsReference(storage, offset);
+ }
+
+ /**
+ * copy kmer in other
+ *
+ * @param other
+ */
+ public KmerBytesWritable(KmerBytesWritable other) {
+ this();
+ setAsCopy(other);
}
/**
* Deep copy of the given kmer
*
- * @param newData
+ * @param other
*/
- public void set(KmerBytesWritable newData) {
- if (newData == null) {
- this.set(0, EMPTY_BYTES, 0);
- } else {
- this.set(newData.kmerlength, newData.bytes, newData.getOffset());
+ public void setAsCopy(KmerBytesWritable other) {
+ if (lettersInKmer > 0) {
+ System.arraycopy(other.bytes, other.offset, bytes, this.offset, bytesUsed);
}
}
/**
* Deep copy of the given bytes data
- * It will not change the kmerlength
*
* @param newData
* @param offset
*/
- public void set(byte[] newData, int offset) {
- if (kmerlength > 0) {
- System.arraycopy(newData, offset, bytes, this.offset, kmerByteSize);
+ public void setAsCopy(byte[] newData, int offset) {
+ if (newData.length - offset < bytesUsed) {
+ throw new IllegalArgumentException("Requested " + bytesUsed + " bytes (k=" + lettersInKmer
+ + ") but buffer has only " + (newData.length - offset) + " bytes");
}
+ System.arraycopy(newData, offset, bytes, this.offset, bytesUsed);
}
/**
- * Deep copy of the given data, and also set to new kmerlength
+ * Point this datablock to the given bytes array It works like the pointer
+ * to new datablock.
*
- * @param k
- * : new kmer length
* @param newData
- * : data storage
* @param offset
- * : start offset
*/
- public void set(int k, byte[] newData, int offset) {
- reset(k);
- if (k > 0) {
- System.arraycopy(newData, offset, bytes, this.offset, kmerByteSize);
+ public void setAsReference(byte[] newData, int offset) {
+ if (newData.length - offset < bytesUsed) {
+ throw new IllegalArgumentException("Requested " + bytesUsed + " bytes (k=" + lettersInKmer
+ + ") but buffer has only " + (newData.length - offset) + " bytes");
}
- }
-
- /**
- * Reset array by kmerlength
- *
- * @param k
- */
- public void reset(int k) {
- this.kmerlength = k;
- setSize(KmerUtil.getByteNumFromK(k));
- clearLeadBit();
- }
-
- /**
- * Point this datablock to the given bytes array
- * It works like the pointer to new datablock.
- * kmerlength will not change
- *
- * @param newData
- * @param offset
- */
- public void setNewReference(byte[] newData, int offset) {
- this.bytes = newData;
+ bytes = newData;
this.offset = offset;
- if (newData.length - offset < kmerByteSize) {
- throw new IllegalArgumentException("Not given enough space");
- }
}
/**
- * Point this datablock to the given bytes array
- * It works like the pointer to new datablock.
- * It also set the new kmerlength
- *
- * @param k
- * @param newData
- * @param offset
- */
- public void setNewReference(int k, byte[] newData, int offset) {
- this.kmerlength = k;
- this.kmerByteSize = KmerUtil.getByteNumFromK(k);
- setNewReference(newData, offset);
- }
-
- protected void setSize(int size) {
- if (size > getCapacity()) {
- setCapacity((size * 3 / 2));
- }
- this.kmerByteSize = size;
- }
-
- protected int getCapacity() {
- return bytes.length;
- }
-
- protected void setCapacity(int new_cap) {
- if (new_cap != getCapacity()) {
- byte[] new_data = new byte[new_cap];
- if (new_cap < kmerByteSize) {
- kmerByteSize = new_cap;
- }
- if (kmerByteSize != 0) {
- System.arraycopy(bytes, offset, new_data, 0, kmerByteSize);
- }
- bytes = new_data;
- offset = 0;
- }
- }
-
- /**
- * Get one genecode (A|G|C|T) from the given kmer index
- * e.g. Get the 4th gene of the kmer ACGTA will return T
+ * Get one genecode (A|G|C|T) from the given kmer index e.g. Get the 4th
+ * gene of the kmer ACGTA will return T
*
* @param pos
* @return
*/
public byte getGeneCodeAtPosition(int pos) {
- if (pos >= kmerlength) {
- throw new IllegalArgumentException("gene position out of bound");
+ if (pos >= lettersInKmer || pos < 0) {
+ throw new ArrayIndexOutOfBoundsException("Gene position (" + pos + ") out of bounds for k=" + lettersInKmer);
}
return geneCodeAtPosition(pos);
}
-
- // unchecked version of above. Used when kmerlength is inaccurate (mid-merge)
+
+ /**
+ * unchecked version of getGeneCodeAtPosition. Used when kmerlength is
+ * inaccurate (mid-merge)
+ */
private byte geneCodeAtPosition(int pos) {
int posByte = pos / 4;
int shift = (pos % 4) << 1;
- return (byte) ((bytes[offset + kmerByteSize - 1 - posByte] >> shift) & 0x3);
+ return (byte) ((bytes[offset + bytesUsed - 1 - posByte] >> shift) & 0x3);
+ }
+
+ public static int getKmerLength() {
+ return lettersInKmer;
}
- public int getKmerLength() {
- return this.kmerlength;
+ public static int getBytesPerKmer() {
+ return bytesUsed;
}
@Override
@@ -224,23 +169,22 @@
@Override
public int getLength() {
- return kmerByteSize;
+ return bytesUsed;
}
/**
* Read Kmer from read text into bytes array e.g. AATAG will compress as
* [0x000G, 0xATAA]
*
- * @param k
- * @param array
+ * @param stringBytes
* @param start
*/
- public void setByRead(byte[] array, int start) {
+ public void setByRead(byte[] stringBytes, int start) {
byte l = 0;
int bytecount = 0;
- int bcount = this.kmerByteSize - 1;
- for (int i = start; i < start + kmerlength && i < array.length; i++) {
- byte code = GeneCode.getCodeFromSymbol(array[i]);
+ int bcount = this.bytesUsed - 1;
+ for (int i = start; i < start + lettersInKmer && i < stringBytes.length; i++) {
+ byte code = GeneCode.getCodeFromSymbol(stringBytes[i]);
l |= (byte) (code << bytecount);
bytecount += 2;
if (bytecount == 8) {
@@ -254,15 +198,9 @@
}
}
- public void setByRead(int k, byte[] array, int start) {
- reset(k);
- setByRead(array, start);
- }
-
/**
- * Compress Reversed read into bytes array
- * e.g. AATAG will paired to CTATT, and then compress as
- * [0x000T,0xTATC]
+ * Compress Reversed read into bytes array e.g. AATAG will paired to CTATT,
+ * and then compress as [0x000T,0xTATC]
*
* @param input
* array
@@ -272,9 +210,10 @@
public void setByReadReverse(byte[] array, int start) {
byte l = 0;
int bytecount = 0;
- int bcount = kmerByteSize - 1;
-// for (int i = start + kmerlength - 1; i >= 0 && i < array.length; i--) {
- for (int i = start + kmerlength - 1; i >= start && i < array.length; i--) {
+ int bcount = bytesUsed - 1;
+ // for (int i = start + kmerlength - 1; i >= 0 && i < array.length; i--)
+ // {
+ for (int i = start + lettersInKmer - 1; i >= start && i < array.length; i--) {
byte code = GeneCode.getPairedCodeFromSymbol(array[i]);
l |= (byte) (code << bytecount);
bytecount += 2;
@@ -289,11 +228,6 @@
}
}
- public void setByReadReverse(int k, byte[] array, int start) {
- reset(k);
- setByReadReverse(array, start);
- }
-
/**
* Shift Kmer to accept new char input
*
@@ -313,12 +247,12 @@
* @return the shift out gene, in gene code format
*/
public byte shiftKmerWithNextCode(byte c) {
- byte output = (byte) (bytes[offset + kmerByteSize - 1] & 0x03);
- for (int i = kmerByteSize - 1; i > 0; i--) {
+ byte output = (byte) (bytes[offset + bytesUsed - 1] & 0x03);
+ for (int i = bytesUsed - 1; i > 0; i--) {
byte in = (byte) (bytes[offset + i - 1] & 0x03);
bytes[offset + i] = (byte) (((bytes[offset + i] >>> 2) & 0x3f) | (in << 6));
}
- int pos = ((kmerlength - 1) % 4) << 1;
+ int pos = ((lettersInKmer - 1) % 4) << 1;
byte code = (byte) (c << pos);
bytes[offset] = (byte) (((bytes[offset] >>> 2) & 0x3f) | code);
clearLeadBit();
@@ -344,152 +278,17 @@
* @return the shiftout gene, in gene code format
*/
public byte shiftKmerWithPreCode(byte c) {
- int pos = ((kmerlength - 1) % 4) << 1;
+ int pos = ((lettersInKmer - 1) % 4) << 1;
byte output = (byte) ((bytes[offset] >> pos) & 0x03);
- for (int i = 0; i < kmerByteSize - 1; i++) {
+ for (int i = 0; i < bytesUsed - 1; i++) {
byte in = (byte) ((bytes[offset + i + 1] >> 6) & 0x03);
bytes[offset + i] = (byte) ((bytes[offset + i] << 2) | in);
}
- bytes[offset + kmerByteSize - 1] = (byte) ((bytes[offset + kmerByteSize - 1] << 2) | c);
+ bytes[offset + bytesUsed - 1] = (byte) ((bytes[offset + bytesUsed - 1] << 2) | c);
clearLeadBit();
return output;
}
- /**
- * Merge Kmer with the next connected Kmer
- * e.g. AAGCTAA merge with AACAACC, if the initial kmerSize = 3
- * then it will return AAGCTAACAACC
- *
- * @param initialKmerSize
- * : the initial kmerSize
- * @param kmer
- * : the next kmer
- */
- public void mergeWithFFKmer(int initialKmerSize, KmerBytesWritable kmer) {
- int preKmerLength = kmerlength;
- int preSize = kmerByteSize;
- this.kmerlength += kmer.kmerlength - initialKmerSize + 1;
- setSize(KmerUtil.getByteNumFromK(kmerlength));
- for (int i = 1; i <= preSize; i++) {
- bytes[offset + kmerByteSize - i] = bytes[offset + preSize - i];
- }
- for (int k = initialKmerSize - 1; k < kmer.getKmerLength(); k += 4) {
- byte onebyte = getOneByteFromKmerAtPosition(k, kmer.getBytes(), kmer.getOffset(), kmer.getLength());
- appendOneByteAtPosition(preKmerLength + k - initialKmerSize + 1, onebyte, bytes, offset, kmerByteSize);
- }
- clearLeadBit();
- }
-
- /**
- * Merge Kmer with the next connected Kmer, when that Kmer needs to be reverse-complemented
- * e.g. AAGCTAA merge with GGTTGTT, if the initial kmerSize = 3
- * then it will return AAGCTAACAACC
- *
- * A merge B => A B~
- *
- * @param initialKmerSize
- * : the initial kmerSize
- * @param kmer
- * : the next kmer
- */
- public void mergeWithFRKmer(int initialKmerSize, KmerBytesWritable kmer) {
- int preSize = kmerByteSize;
- int preKmerLength = kmerlength;
- this.kmerlength += kmer.kmerlength - initialKmerSize + 1;
- setSize(KmerUtil.getByteNumFromK(kmerlength));
- // copy prefix into right-side of buffer
- for (int i = 1; i <= preSize; i++) {
- bytes[offset + kmerByteSize - i] = bytes[offset + preSize - i];
- }
-
- int bytecount = (preKmerLength % 4) * 2;
- int bcount = kmerByteSize - preSize - bytecount / 8; // may overlap previous kmer
- byte l = bcount == kmerByteSize - preSize ? bytes[offset + bcount] : 0x00;
- bytecount %= 8;
- for (int i = kmer.kmerlength - initialKmerSize; i >= 0; i--) {
- byte code = GeneCode.getPairedGeneCode(kmer.getGeneCodeAtPosition(i));
- l |= (byte) (code << bytecount);
- bytecount += 2;
- if (bytecount == 8) {
- bytes[offset + bcount--] = l;
- l = 0;
- bytecount = 0;
- }
- }
- if (bcount >= 0) {
- bytes[offset] = l;
- }
- }
-
- /**
- * Merge Kmer with the previous connected Kmer, when that kmer needs to be reverse-complemented
- * e.g. AACAACC merge with TTCTGCC, if the initial kmerSize = 3
- * then it will return GGCAGAACAACC
- *
- * @param initialKmerSize
- * : the initial kmerSize
- * @param preKmer
- * : the previous kmer
- */
- public void mergeWithRFKmer(int initialKmerSize, KmerBytesWritable preKmer) {
- KmerBytesWritable reversed = new KmerBytesWritable(preKmer.kmerlength);
- reversed.setByReadReverse(preKmer.toString().getBytes(), 0);
- mergeWithRRKmer(initialKmerSize, reversed);
- }
-
- /**
- * Merge Kmer with the previous connected Kmer
- * e.g. AACAACC merge with AAGCTAA, if the initial kmerSize = 3
- * then it will return AAGCTAACAACC
- *
- * @param initialKmerSize
- * : the initial kmerSize
- * @param preKmer
- * : the previous kmer
- */
- public void mergeWithRRKmer(int initialKmerSize, KmerBytesWritable preKmer) {
- int preKmerLength = kmerlength;
- int preSize = kmerByteSize;
- this.kmerlength += preKmer.kmerlength - initialKmerSize + 1;
- setSize(KmerUtil.getByteNumFromK(kmerlength));
- byte cacheByte = getOneByteFromKmerAtPosition(0, bytes, offset, preSize);
-
- // copy prekmer
- for (int k = 0; k < preKmer.kmerlength - initialKmerSize + 1; k += 4) {
- byte onebyte = getOneByteFromKmerAtPosition(k, preKmer.bytes, preKmer.offset, preKmer.kmerByteSize);
- appendOneByteAtPosition(k, onebyte, bytes, offset, kmerByteSize);
- }
-
- // copy current kmer
- int k = 4;
- for (; k < preKmerLength; k += 4) {
- byte onebyte = getOneByteFromKmerAtPosition(k, bytes, offset, preSize);
- appendOneByteAtPosition(preKmer.kmerlength - initialKmerSize + k - 4 + 1, cacheByte, bytes, offset, kmerByteSize);
- cacheByte = onebyte;
- }
- appendOneByteAtPosition(preKmer.kmerlength - initialKmerSize + k - 4 + 1, cacheByte, bytes, offset, kmerByteSize);
- clearLeadBit();
- }
-
- public void mergeWithKmerInDir(byte dir, int initialKmerSize, KmerBytesWritable kmer) {
- switch(dir & DirectionFlag.DIR_MASK) {
- case DirectionFlag.DIR_FF:
- mergeWithFFKmer(initialKmerSize, kmer);
- break;
- case DirectionFlag.DIR_FR:
- mergeWithFRKmer(initialKmerSize, kmer);
- break;
- case DirectionFlag.DIR_RF:
- mergeWithRFKmer(initialKmerSize, kmer);
- break;
- case DirectionFlag.DIR_RR:
- mergeWithRRKmer(initialKmerSize, kmer);
- break;
- default:
- throw new RuntimeException("Direciotn not recognized: " + dir);
- }
- }
-
public static void appendOneByteAtPosition(int k, byte onebyte, byte[] buffer, int start, int length) {
int position = start + length - 1 - k / 4;
if (position < start) {
@@ -518,63 +317,53 @@
}
protected void clearLeadBit() {
- if (kmerlength % 4 != 0) {
- bytes[offset] &= (1 << ((kmerlength % 4) << 1)) - 1;
+ if (lettersInKmer % 4 != 0) {
+ bytes[offset] &= (1 << ((lettersInKmer % 4) << 1)) - 1;
}
}
@Override
public void readFields(DataInput in) throws IOException {
- this.kmerlength = in.readInt();
- this.kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- if (this.kmerlength > 0) {
- if (this.bytes.length < this.kmerByteSize) {
- this.bytes = new byte[this.kmerByteSize];
- this.offset = 0;
- }
- in.readFully(bytes, offset, kmerByteSize);
- }
+ in.readFully(bytes, offset, bytesUsed);
}
@Override
public void write(DataOutput out) throws IOException {
- out.writeInt(kmerlength);
- if (kmerlength > 0) {
- out.write(bytes, offset, kmerByteSize);
- }
+ out.write(bytes, offset, bytesUsed);
}
@Override
public int hashCode() {
- return super.hashCode() * 31 + this.kmerlength;
+ return Marshal.hashBytes(bytes, offset, bytesUsed);
}
@Override
public boolean equals(Object right_obj) {
- if (right_obj instanceof KmerBytesWritable)
- return this.kmerlength == ((KmerBytesWritable) right_obj).kmerlength && super.equals(right_obj);
+ if (right_obj instanceof KmerBytesWritable) {
+ // since these may be backed by storage of different sizes, we have to manually check each byte
+ KmerBytesWritable right = (KmerBytesWritable) right_obj;
+ for (int i=0; i < bytesUsed; i++) {
+ if (bytes[offset + i] != right.bytes[right.offset + i]) {
+ return false;
+ }
+ }
+ return true;
+ }
return false;
}
@Override
public String toString() {
- return KmerUtil.recoverKmerFrom(this.kmerlength, this.getBytes(), offset, this.getLength());
+ return KmerUtil.recoverKmerFrom(lettersInKmer, bytes, offset, bytesUsed);
}
public static class Comparator extends WritableComparator {
- public final int LEAD_BYTES = 4;
-
public Comparator() {
super(KmerBytesWritable.class);
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- int kmerlength1 = readInt(b1, s1);
- int kmerlength2 = readInt(b2, s2);
- if (kmerlength1 == kmerlength2) {
- return compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES);
- }
- return kmerlength1 - kmerlength2;
+ return compareBytes(b1, s1, l1, b2, s2, l2);
}
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java
index d2e3a94..16df821 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java
@@ -16,10 +16,10 @@
package edu.uci.ics.genomix.type;
public class KmerBytesWritableFactory {
- private KmerBytesWritable kmer;
+ private VKmerBytesWritable kmer;
public KmerBytesWritableFactory(int k) {
- kmer = new KmerBytesWritable(k);
+ kmer = new VKmerBytesWritable(k);
}
/**
@@ -30,7 +30,7 @@
* @param array
* @param start
*/
- public KmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
+ public VKmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
kmer.reset(k);
kmer.setByRead(array, start);
return kmer;
@@ -43,7 +43,7 @@
* @param array
* @param start
*/
- public KmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
+ public VKmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
kmer.reset(k);
kmer.setByReadReverse(array, start);
return kmer;
@@ -59,28 +59,28 @@
* @param kmerChain
* @return LastKmer bytes array
*/
- public KmerBytesWritable getLastKmerFromChain(int lastK, final KmerBytesWritable kmerChain) {
- if (lastK > kmerChain.getKmerLength()) {
+ public VKmerBytesWritable getLastKmerFromChain(int lastK, final VKmerBytesWritable kmerChain) {
+ if (lastK > kmerChain.getKmerLetterLength()) {
return null;
}
- if (lastK == kmerChain.getKmerLength()) {
- kmer.set(kmerChain);
+ if (lastK == kmerChain.getKmerLetterLength()) {
+ kmer.setAsCopy(kmerChain);
return kmer;
}
kmer.reset(lastK);
/** from end to start */
- int byteInChain = kmerChain.getLength() - 1 - (kmerChain.getKmerLength() - lastK) / 4;
- int posInByteOfChain = ((kmerChain.getKmerLength() - lastK) % 4) << 1; // *2
- int byteInKmer = kmer.getLength() - 1;
+ int byteInChain = kmerChain.getKmerByteLength() - 1 - (kmerChain.getKmerLetterLength() - lastK) / 4;
+ int posInByteOfChain = ((kmerChain.getKmerLetterLength() - lastK) % 4) << 1; // *2
+ int byteInKmer = kmer.getKmerByteLength() - 1;
for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
- kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
- kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ kmer.getBytes()[byteInKmer + kmer.getKmerOffset()] = (byte) ((0xff & kmerChain.getBytes()[byteInChain + kmerChain.getKmerOffset()]) >> posInByteOfChain);
+ kmer.getBytes()[byteInKmer + kmer.getKmerOffset()] |= ((kmerChain.getBytes()[byteInChain + kmerChain.getKmerOffset() - 1] << (8 - posInByteOfChain)));
}
/** last kmer byte */
if (byteInKmer == 0) {
- kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ kmer.getBytes()[0 + kmer.getKmerOffset()] = (byte) ((kmerChain.getBytes()[0 + kmerChain.getKmerOffset()] & 0xff) >> posInByteOfChain);
}
kmer.clearLeadBit();
return kmer;
@@ -95,52 +95,52 @@
* @param kmerChain
* @return FirstKmer bytes array
*/
- public KmerBytesWritable getFirstKmerFromChain(int firstK, final KmerBytesWritable kmerChain) {
- if (firstK > kmerChain.getKmerLength()) {
+ public VKmerBytesWritable getFirstKmerFromChain(int firstK, final VKmerBytesWritable kmerChain) {
+ if (firstK > kmerChain.getKmerLetterLength()) {
return null;
}
- if (firstK == kmerChain.getKmerLength()) {
- kmer.set(kmerChain);
+ if (firstK == kmerChain.getKmerLetterLength()) {
+ kmer.setAsCopy(kmerChain);
return kmer;
}
kmer.reset(firstK);
int i = 1;
- for (; i < kmer.getLength(); i++) {
- kmer.getBytes()[kmer.getLength() - i] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ for (; i < kmer.getKmerByteLength(); i++) {
+ kmer.getBytes()[kmer.getKmerOffset() + kmer.getKmerByteLength() - i] = kmerChain.getBytes()[kmerChain.getKmerOffset() + kmerChain.getKmerByteLength() - i];
}
int posInByteOfChain = (firstK % 4) << 1; // *2
if (posInByteOfChain == 0) {
- kmer.getBytes()[0] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ kmer.getBytes()[0 + kmer.getKmerOffset()] = kmerChain.getBytes()[kmerChain.getKmerOffset() + kmerChain.getKmerByteLength() - i];
} else {
- kmer.getBytes()[0] = (byte) (kmerChain.getBytes()[kmerChain.getLength() - i] & ((1 << posInByteOfChain) - 1));
+ kmer.getBytes()[0 + kmer.getKmerOffset()] = (byte) (kmerChain.getBytes()[kmerChain.getKmerOffset() + kmerChain.getKmerByteLength() - i] & ((1 << posInByteOfChain) - 1));
}
kmer.clearLeadBit();
return kmer;
}
- public KmerBytesWritable getSubKmerFromChain(int startK, int kSize, final KmerBytesWritable kmerChain) {
- if (startK + kSize > kmerChain.getKmerLength()) {
+ public VKmerBytesWritable getSubKmerFromChain(int startK, int kSize, final VKmerBytesWritable kmerChain) {
+ if (startK + kSize > kmerChain.getKmerLetterLength()) {
return null;
}
- if (startK == 0 && kSize == kmerChain.getKmerLength()) {
- kmer.set(kmerChain);
+ if (startK == 0 && kSize == kmerChain.getKmerLetterLength()) {
+ kmer.setAsCopy(kmerChain);
return kmer;
}
kmer.reset(kSize);
/** from end to start */
- int byteInChain = kmerChain.getLength() - 1 - startK / 4;
+ int byteInChain = kmerChain.getKmerByteLength() - 1 - startK / 4;
int posInByteOfChain = startK % 4 << 1; // *2
- int byteInKmer = kmer.getLength() - 1;
+ int byteInKmer = kmer.getKmerByteLength() - 1;
for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
- kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
- kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ kmer.getBytes()[byteInKmer + kmer.getKmerOffset()] = (byte) ((0xff & kmerChain.getBytes()[byteInChain + kmerChain.getKmerOffset()]) >> posInByteOfChain);
+ kmer.getBytes()[byteInKmer + kmer.getKmerOffset()] |= ((kmerChain.getBytes()[byteInChain + kmerChain.getKmerOffset() - 1] << (8 - posInByteOfChain)));
}
/** last kmer byte */
if (byteInKmer == 0) {
- kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ kmer.getBytes()[0 + kmer.getKmerOffset()] = (byte) ((kmerChain.getBytes()[0 + kmerChain.getKmerOffset()] & 0xff) >> posInByteOfChain);
}
kmer.clearLeadBit();
return kmer;
@@ -159,15 +159,15 @@
* : next neighbor in gene-code format
* @return the merged Kmer, this K of this Kmer is k+1
*/
- public KmerBytesWritable mergeKmerWithNextCode(final KmerBytesWritable kmer, byte nextCode) {
- this.kmer.reset(kmer.getKmerLength() + 1);
- for (int i = 1; i <= kmer.getLength(); i++) {
- this.kmer.getBytes()[this.kmer.getLength() - i] = kmer.getBytes()[kmer.getLength() - i];
+ public VKmerBytesWritable mergeKmerWithNextCode(final VKmerBytesWritable kmer, byte nextCode) {
+ this.kmer.reset(kmer.getKmerLetterLength() + 1);
+ for (int i = 1; i <= kmer.getKmerByteLength(); i++) {
+ this.kmer.getBytes()[this.kmer.getKmerOffset() + this.kmer.getKmerByteLength() - i] = kmer.getBytes()[kmer.getKmerOffset() + kmer.getKmerByteLength() - i];
}
- if (this.kmer.getLength() > kmer.getLength()) {
- this.kmer.getBytes()[0] = (byte) (nextCode & 0x3);
+ if (this.kmer.getKmerByteLength() > kmer.getKmerByteLength()) {
+ this.kmer.getBytes()[0 + kmer.getKmerOffset()] = (byte) (nextCode & 0x3);
} else {
- this.kmer.getBytes()[0] = (byte) (kmer.getBytes()[0] | ((nextCode & 0x3) << ((kmer.getKmerLength() % 4) << 1)));
+ this.kmer.getBytes()[0 + kmer.getKmerOffset()] = (byte) (kmer.getBytes()[0 + kmer.getKmerOffset()] | ((nextCode & 0x3) << ((kmer.getKmerLetterLength() % 4) << 1)));
}
this.kmer.clearLeadBit();
return this.kmer;
@@ -186,17 +186,17 @@
* : next neighbor in gene-code format
* @return the merged Kmer,this K of this Kmer is k+1
*/
- public KmerBytesWritable mergeKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
- this.kmer.reset(kmer.getKmerLength() + 1);
+ public VKmerBytesWritable mergeKmerWithPreCode(final VKmerBytesWritable kmer, byte preCode) {
+ this.kmer.reset(kmer.getKmerLetterLength() + 1);
int byteInMergedKmer = 0;
- if (kmer.getKmerLength() % 4 == 0) {
- this.kmer.getBytes()[0] = (byte) ((kmer.getBytes()[0] >> 6) & 0x3);
+ if (kmer.getKmerLetterLength() % 4 == 0) { // input fills its bytes exactly, so the merged kmer needs one extra leading byte
+ this.kmer.getBytes()[0 + this.kmer.getKmerOffset()] = (byte) ((kmer.getBytes()[0 + kmer.getKmerOffset()] >> 6) & 0x3); // destination uses this.kmer's offset; the parameter `kmer` shadows the field
byteInMergedKmer++;
}
- for (int i = 0; i < kmer.getLength() - 1; i++, byteInMergedKmer++) {
- this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[i] << 2) | ((kmer.getBytes()[i + 1] >> 6) & 0x3));
+ for (int i = 0; i < kmer.getKmerByteLength() - 1; i++, byteInMergedKmer++) {
+ this.kmer.getBytes()[byteInMergedKmer + this.kmer.getKmerOffset()] = (byte) ((kmer.getBytes()[i + kmer.getKmerOffset()] << 2) | ((kmer.getBytes()[i + kmer.getKmerOffset() + 1] >> 6) & 0x3));
}
- this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[kmer.getLength() - 1] << 2) | (preCode & 0x3));
+ this.kmer.getBytes()[byteInMergedKmer + this.kmer.getKmerOffset()] = (byte) ((kmer.getBytes()[kmer.getKmerOffset() + kmer.getKmerByteLength() - 1] << 2) | (preCode & 0x3));
this.kmer.clearLeadBit();
return this.kmer;
}
@@ -215,28 +215,28 @@
* : bytes array of next kmer
* @return merged kmer, the new k is @preK + @nextK
*/
- public KmerBytesWritable mergeTwoKmer(final KmerBytesWritable preKmer, final KmerBytesWritable nextKmer) {
- kmer.reset(preKmer.getKmerLength() + nextKmer.getKmerLength());
+ public VKmerBytesWritable mergeTwoKmer(final VKmerBytesWritable preKmer, final VKmerBytesWritable nextKmer) {
+ kmer.reset(preKmer.getKmerLetterLength() + nextKmer.getKmerLetterLength());
int i = 1;
- for (; i <= preKmer.getLength(); i++) {
- kmer.getBytes()[kmer.getLength() - i] = preKmer.getBytes()[preKmer.getLength() - i];
+ for (; i <= preKmer.getKmerByteLength(); i++) {
+ kmer.getBytes()[kmer.getKmerOffset() + kmer.getKmerByteLength() - i] = preKmer.getBytes()[preKmer.getKmerOffset() + preKmer.getKmerByteLength() - i];
}
if (i > 1) {
i--;
}
- if (preKmer.getKmerLength() % 4 == 0) {
- for (int j = 1; j <= nextKmer.getLength(); j++) {
- kmer.getBytes()[kmer.getLength() - i - j] = nextKmer.getBytes()[nextKmer.getLength() - j];
+ if (preKmer.getKmerLetterLength() % 4 == 0) {
+ for (int j = 1; j <= nextKmer.getKmerByteLength(); j++) {
+ kmer.getBytes()[kmer.getKmerOffset() + kmer.getKmerByteLength() - i - j] = nextKmer.getBytes()[nextKmer.getKmerOffset() + nextKmer.getKmerByteLength() - j];
}
} else {
- int posNeedToMove = ((preKmer.getKmerLength() % 4) << 1);
- kmer.getBytes()[kmer.getLength() - i] |= nextKmer.getBytes()[nextKmer.getLength() - 1] << posNeedToMove;
- for (int j = 1; j < nextKmer.getLength(); j++) {
- kmer.getBytes()[kmer.getLength() - i - j] = (byte) (((nextKmer.getBytes()[nextKmer.getLength() - j] & 0xff) >> (8 - posNeedToMove)) | (nextKmer
- .getBytes()[nextKmer.getLength() - j - 1] << posNeedToMove));
+ int posNeedToMove = ((preKmer.getKmerLetterLength() % 4) << 1);
+ kmer.getBytes()[kmer.getKmerOffset() + kmer.getKmerByteLength() - i] |= nextKmer.getBytes()[nextKmer.getKmerOffset() + nextKmer.getKmerByteLength() - 1] << posNeedToMove;
+ for (int j = 1; j < nextKmer.getKmerByteLength(); j++) {
+ kmer.getBytes()[kmer.getKmerOffset() + kmer.getKmerByteLength() - i - j] = (byte) (((nextKmer.getBytes()[nextKmer.getKmerOffset() + nextKmer.getKmerByteLength() - j] & 0xff) >> (8 - posNeedToMove)) | (nextKmer
+ .getBytes()[nextKmer.getKmerOffset() + nextKmer.getKmerByteLength() - j - 1] << posNeedToMove));
}
- if (nextKmer.getKmerLength() % 4 == 0 || (nextKmer.getKmerLength() % 4) * 2 + posNeedToMove > 8) {
- kmer.getBytes()[0] = (byte) ((0xff & nextKmer.getBytes()[0]) >> (8 - posNeedToMove));
+ if (nextKmer.getKmerLetterLength() % 4 == 0 || (nextKmer.getKmerLetterLength() % 4) * 2 + posNeedToMove > 8) {
+ kmer.getBytes()[0 + kmer.getKmerOffset()] = (byte) ((0xff & nextKmer.getBytes()[0 + nextKmer.getKmerOffset()]) >> (8 - posNeedToMove));
}
}
kmer.clearLeadBit();
@@ -255,8 +255,8 @@
* : input genecode
* @return new created kmer that shifted by afterCode, the K will not change
*/
- public KmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode) {
- this.kmer.set(kmer);
+ public VKmerBytesWritable shiftKmerWithNextCode(final VKmerBytesWritable kmer, byte afterCode) {
+ this.kmer.setAsCopy(kmer);
this.kmer.shiftKmerWithNextCode(afterCode);
return this.kmer;
}
@@ -273,8 +273,8 @@
* : input genecode
* @return new created kmer that shifted by preCode, the K will not change
*/
- public KmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
- this.kmer.set(kmer);
+ public VKmerBytesWritable shiftKmerWithPreCode(final VKmerBytesWritable kmer, byte preCode) {
+ this.kmer.setAsCopy(kmer);
this.kmer.shiftKmerWithPreCode(preCode);
return this.kmer;
}
@@ -284,22 +284,22 @@
*
* @param kmer
*/
- public KmerBytesWritable reverse(final KmerBytesWritable kmer) {
- this.kmer.reset(kmer.getKmerLength());
+ public VKmerBytesWritable reverse(final VKmerBytesWritable kmer) {
+ this.kmer.reset(kmer.getKmerLetterLength());
- int curPosAtKmer = ((kmer.getKmerLength() - 1) % 4) << 1;
+ int curPosAtKmer = ((kmer.getKmerLetterLength() - 1) % 4) << 1;
int curByteAtKmer = 0;
int curPosAtReverse = 0;
- int curByteAtReverse = this.kmer.getLength() - 1;
- this.kmer.getBytes()[curByteAtReverse] = 0;
- for (int i = 0; i < kmer.getKmerLength(); i++) {
- byte gene = (byte) ((kmer.getBytes()[curByteAtKmer] >> curPosAtKmer) & 0x03);
- this.kmer.getBytes()[curByteAtReverse] |= gene << curPosAtReverse;
+ int curByteAtReverse = this.kmer.getKmerByteLength() - 1;
+ this.kmer.getBytes()[curByteAtReverse + this.kmer.getKmerOffset()] = 0;
+ for (int i = 0; i < kmer.getKmerLetterLength(); i++) {
+ byte gene = (byte) ((kmer.getBytes()[curByteAtKmer + kmer.getKmerOffset()] >> curPosAtKmer) & 0x03);
+ this.kmer.getBytes()[curByteAtReverse + this.kmer.getKmerOffset()] |= gene << curPosAtReverse;
curPosAtReverse += 2;
if (curPosAtReverse >= 8) {
curPosAtReverse = 0;
- this.kmer.getBytes()[--curByteAtReverse] = 0;
+ this.kmer.getBytes()[--curByteAtReverse + this.kmer.getKmerOffset()] = 0;
}
curPosAtKmer -= 2;
if (curPosAtKmer < 0) {
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerListWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerListWritable.java
index 88bb79c..2aee32d 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerListWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerListWritable.java
@@ -4,136 +4,156 @@
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.io.Writable;
-import edu.uci.ics.genomix.data.KmerUtil;
import edu.uci.ics.genomix.data.Marshal;
-public class KmerListWritable implements Writable, Iterable<KmerBytesWritable>, Serializable{
+/**
+ * A list of fixed-length kmers. The length of this list is stored internally.
+ */
+public class KmerListWritable implements Writable, Iterable<KmerBytesWritable>, Serializable {
private static final long serialVersionUID = 1L;
+ protected static final byte[] EMPTY_BYTES = { 0, 0, 0, 0 };
+ protected static final int HEADER_SIZE = 4;
+
protected byte[] storage;
protected int offset;
protected int valueCount;
- public int kmerByteSize = 0;
- public int kmerlength = 0;
- protected static final byte[] EMPTY = {};
-
- protected KmerBytesWritable posIter = new KmerBytesWritable();
-
+ protected int storageMaxSize; // since we may be a reference inside a larger datablock, we must track our maximum size
+
+ private KmerBytesWritable posIter = new KmerBytesWritable();
+
public KmerListWritable() {
- this.storage = EMPTY;
- this.valueCount = 0;
- this.offset = 0;
+ storage = EMPTY_BYTES;
+ valueCount = 0;
+ offset = 0;
+ storageMaxSize = storage.length;
}
-
- public KmerListWritable(int kmerlength) {
- this();
- this.kmerlength = kmerlength;
- this.kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
+
+ public KmerListWritable(byte[] data, int offset) {
+ setNewReference(data, offset);
}
-
- public KmerListWritable(int kmerlength, int count, byte[] data, int offset) {
- this.kmerlength = kmerlength;
- this.kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- setNewReference(count, data, offset);
- }
-
+
public KmerListWritable(List<KmerBytesWritable> kmers) {
this();
- setSize(kmers.size()); // reserve space for all elements
+ setSize(kmers.size() * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE); // reserve space for all elements
for (KmerBytesWritable kmer : kmers) {
append(kmer);
}
}
-
- public void setNewReference(int count, byte[] data, int offset) {
- this.valueCount = count;
+
+ /**
+ * Point this list at an already-serialized list inside `data` without copying it.
+ * The element count is read from the HEADER_SIZE-byte header found at `offset`.
+ *
+ * @param data buffer holding the count header followed by the packed kmers
+ * @param offset index in `data` where the count header starts
+ * @throws IllegalArgumentException if the header plus the announced elements do not fit in
+ * `data` starting at `offset`
+ */
+ public void setNewReference(byte[] data, int offset) {
+ // validate into locals first so that a rejected buffer leaves this list untouched
+ int newValueCount = Marshal.getInt(data, offset);
+ int requiredBytes = newValueCount * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE;
+ if (requiredBytes > data.length - offset) {
+ throw new IllegalArgumentException("Specified data buffer (len=" + (data.length - offset)
+ + ") is not large enough to store requested number of elements (" + newValueCount + ")!");
+ }
+ this.valueCount = newValueCount;
this.storage = data;
this.offset = offset;
+ this.storageMaxSize = requiredBytes;
}
-
- public void append(KmerBytesWritable kmer){
- if(kmer != null){
- kmerByteSize = kmer.kmerByteSize;
- kmerlength = kmer.kmerlength;
- setSize((1 + valueCount) * kmerByteSize);
- System.arraycopy(kmer.getBytes(), 0, storage, offset + valueCount * kmerByteSize, kmerByteSize);
- valueCount += 1;
- }
+
+ public void append(KmerBytesWritable kmer) {
+ setSize((1 + valueCount) * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE);
+ System.arraycopy(kmer.getBytes(), 0, storage,
+ offset + HEADER_SIZE + valueCount * KmerBytesWritable.getBytesPerKmer(),
+ KmerBytesWritable.getBytesPerKmer());
+ valueCount += 1;
+ Marshal.putInt(valueCount, storage, offset);
}
-
+
/*
* Append the otherList to the end of myList
*/
public void appendList(KmerListWritable otherList) {
if (otherList.valueCount > 0) {
- setSize((valueCount + otherList.valueCount) * kmerByteSize);
+ setSize((valueCount + otherList.valueCount) * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE);
// copy contents of otherList into the end of my storage
- System.arraycopy(otherList.storage, otherList.offset,
- storage, offset + valueCount * kmerByteSize,
- otherList.valueCount * kmerByteSize);
+ System.arraycopy(otherList.storage, otherList.offset + HEADER_SIZE, storage, offset + HEADER_SIZE
+ + valueCount * KmerBytesWritable.getBytesPerKmer(),
+ otherList.valueCount * KmerBytesWritable.getBytesPerKmer());
valueCount += otherList.valueCount;
+ Marshal.putInt(valueCount, storage, offset);
}
}
-
+
+ /**
+ * Replace this list's contents with the union of this list and otherList,
+ * de-duplicating the elements through a temporary HashSet.
+ * NOTE(review): the set is filled with whatever the list iterators hand back. If they
+ * return the shared, storage-backed instance that getPosition() returns, the set entries
+ * alias storage that append() overwrites below -- confirm the iterators yield copies.
+ */
+ public void unionUpdate(KmerListWritable otherList) {
+ int newSize = valueCount + otherList.valueCount;
+ HashSet<KmerBytesWritable> uniqueElements = new HashSet<KmerBytesWritable>(newSize);
+ for (KmerBytesWritable kmer : this) {
+ uniqueElements.add(kmer);
+ }
+ for (KmerBytesWritable kmer : otherList) {
+ uniqueElements.add(kmer);
+ }
+ valueCount = 0;
+ setSize(newSize * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE);
+ for (KmerBytesWritable kmer : uniqueElements) {
+ append(kmer);
+ }
+ Marshal.putInt(valueCount, storage, offset);
+ }
+
protected void setSize(int size) {
if (size > getCapacity()) {
setCapacity((size * 3 / 2));
}
}
-
+
protected int getCapacity() {
- return storage.length - offset;
+ return storageMaxSize - offset;
}
protected void setCapacity(int new_cap) {
if (new_cap > getCapacity()) {
byte[] new_data = new byte[new_cap];
- if (storage.length - offset > 0) {
- System.arraycopy(storage, offset, new_data, 0, storage.length - offset);
+ if (valueCount > 0) {
+ System.arraycopy(storage, offset, new_data, 0, valueCount * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE);
}
storage = new_data;
offset = 0;
+ storageMaxSize = storage.length;
}
}
-
+
public void reset() {
- this.reset(0);
- }
-
- public void reset(int kmerSize) {
- kmerlength = kmerSize;
- kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- storage = EMPTY;
valueCount = 0;
- offset = 0;
+ Marshal.putInt(valueCount, storage, offset); // keep the stored count header in sync: write() emits the header bytes verbatim
}
-
+
public KmerBytesWritable getPosition(int i) {
if (i >= valueCount) {
throw new ArrayIndexOutOfBoundsException("No such positions");
}
- posIter.setNewReference(kmerlength, storage, offset + i * kmerByteSize);
+ posIter.setAsReference(storage, offset + HEADER_SIZE + i * KmerBytesWritable.getBytesPerKmer());
return posIter;
}
-
- public void set(KmerListWritable otherList) {
- this.kmerlength = otherList.kmerlength;
- this.kmerByteSize = otherList.kmerByteSize;
- set(otherList.valueCount, otherList.storage, otherList.offset);
+
+ public void setCopy(KmerListWritable otherList) {
+ setCopy(otherList.storage, otherList.offset);
}
- public void set(int valueCount, byte[] newData, int offset) {
- this.valueCount = valueCount;
- setSize(valueCount * kmerByteSize);
- if (valueCount > 0) {
- System.arraycopy(newData, offset, storage, this.offset, valueCount * kmerByteSize);
+ /**
+ * read a KmerListWritable from newData, which should include the header
+ */
+ public void setCopy(byte[] newData, int offset) {
+ int newValueCount = Marshal.getInt(newData, offset);
+ setSize(newValueCount * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE);
+ if (newValueCount > 0) {
+ System.arraycopy(newData, offset + HEADER_SIZE, storage, this.offset + HEADER_SIZE, newValueCount
+ * KmerBytesWritable.getBytesPerKmer());
}
+ valueCount = newValueCount;
+ Marshal.putInt(valueCount, storage, this.offset);
}
-
+
@Override
public Iterator<KmerBytesWritable> iterator() {
Iterator<KmerBytesWritable> it = new Iterator<KmerBytesWritable>() {
@@ -152,50 +172,54 @@
@Override
public void remove() {
- if(currentIndex < valueCount)
- System.arraycopy(storage, offset + currentIndex * kmerByteSize,
- storage, offset + (currentIndex - 1) * kmerByteSize,
- (valueCount - currentIndex) * kmerByteSize);
+ if (currentIndex < valueCount) // shift the tail down one slot; elements start HEADER_SIZE bytes after offset
+ System.arraycopy(storage, offset + HEADER_SIZE + currentIndex * KmerBytesWritable.getBytesPerKmer(), storage,
+ offset + HEADER_SIZE + (currentIndex - 1) * KmerBytesWritable.getBytesPerKmer(),
+ (valueCount - currentIndex) * KmerBytesWritable.getBytesPerKmer());
valueCount--;
currentIndex--;
+ Marshal.putInt(valueCount, storage, offset);
}
};
return it;
}
-
+
/*
- * remove the first instance of @toRemove. Uses a linear scan. Throws an exception if not in this list.
+ * Remove the first instance of `toRemove`. Uses a linear scan. Throws an
+ * ArrayIndexOutOfBoundsException if it is not in this list, unless `ignoreMissing` is true.
*/
public void remove(KmerBytesWritable toRemove, boolean ignoreMissing) {
Iterator<KmerBytesWritable> posIterator = this.iterator();
while (posIterator.hasNext()) {
- if(toRemove.equals(posIterator.next())) {
+ if (toRemove.equals(posIterator.next())) {
posIterator.remove();
- return;
+ return; // break as soon as the element is found
}
}
+ // element was not found
if (!ignoreMissing) {
- throw new ArrayIndexOutOfBoundsException("the KmerBytesWritable `" + toRemove.toString() + "` was not found in this list.");
+ throw new ArrayIndexOutOfBoundsException("the KmerBytesWritable `" + toRemove.toString()
+ + "` was not found in this list.");
}
}
-
+
public void remove(KmerBytesWritable toRemove) {
remove(toRemove, false);
}
@Override
public void readFields(DataInput in) throws IOException {
- this.valueCount = in.readInt();
- setSize(valueCount * kmerByteSize);//kmerByteSize
- in.readFully(storage, offset, valueCount * kmerByteSize);//kmerByteSize
+ // Read the count into a local and empty the list before growing: setCapacity() copies
+ // valueCount elements out of the old storage, which may be smaller than the incoming list.
+ int newValueCount = in.readInt();
+ valueCount = 0;
+ setSize(newValueCount * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE);
+ // write() emits the header followed by every element, so the payload after the count is exactly count * bytesPerKmer
+ in.readFully(storage, offset + HEADER_SIZE, newValueCount * KmerBytesWritable.getBytesPerKmer());
+ valueCount = newValueCount;
+ Marshal.putInt(valueCount, storage, offset);
}
@Override
public void write(DataOutput out) throws IOException {
- out.writeInt(valueCount);
- out.write(storage, offset, valueCount * kmerByteSize);
+ out.write(storage, offset, valueCount * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE);
}
-
+
public int getCountOfPosition() {
return valueCount;
}
@@ -207,16 +231,16 @@
public int getStartOffset() {
return offset;
}
-
+
public int getLength() {
- return valueCount * kmerByteSize;
+ return valueCount * KmerBytesWritable.getBytesPerKmer() + HEADER_SIZE;
}
-
+
@Override
public String toString() {
StringBuilder sbuilder = new StringBuilder();
sbuilder.append('[');
- for(int i = 0; i < valueCount; i++){
+ for (int i = 0; i < valueCount; i++) {
sbuilder.append(getPosition(i).toString());
sbuilder.append(',');
}
@@ -227,7 +251,7 @@
}
return sbuilder.toString();
}
-
+
@Override
public int hashCode() {
return Marshal.hashBytes(getByteArray(), getStartOffset(), getLength());
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
index efa87f7..362c12e 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
@@ -1,25 +1,27 @@
package edu.uci.ics.genomix.type;
+import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
+import java.io.DataOutputStream;
import java.io.IOException;
import java.io.Serializable;
+import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableComparable;
public class NodeWritable implements WritableComparable<NodeWritable>, Serializable{
private static final long serialVersionUID = 1L;
- public static final NodeWritable EMPTY_NODE = new NodeWritable(0);
+ public static final NodeWritable EMPTY_NODE = new NodeWritable();
private PositionListWritable nodeIdList;
private KmerListWritable forwardForwardList;
private KmerListWritable forwardReverseList;
private KmerListWritable reverseForwardList;
private KmerListWritable reverseReverseList;
- private KmerBytesWritable kmer;
- private int kmerlength = 0;
+ private VKmerBytesWritable kmer;
// merge/update directions
public static class DirectionFlag {
@@ -31,48 +33,41 @@
}
public NodeWritable() {
- this(0);
- }
-
- public NodeWritable(int kmerlenth) {
- this.kmerlength = kmerlenth;
nodeIdList = new PositionListWritable();
- forwardForwardList = new KmerListWritable(kmerlenth);
- forwardReverseList = new KmerListWritable(kmerlenth);
- reverseForwardList = new KmerListWritable(kmerlenth);
- reverseReverseList = new KmerListWritable(kmerlenth);
- kmer = new KmerBytesWritable(); //in graph construction - not set kmerlength Optimization: VKmer
+ forwardForwardList = new KmerListWritable();
+ forwardReverseList = new KmerListWritable();
+ reverseForwardList = new KmerListWritable();
+ reverseReverseList = new KmerListWritable();
+ kmer = new VKmerBytesWritable(); // in graph construction - not set kmerlength Optimization: VKmer
}
public NodeWritable(PositionListWritable nodeIdList, KmerListWritable FFList, KmerListWritable FRList,
- KmerListWritable RFList, KmerListWritable RRList, KmerBytesWritable kmer) {
- this(kmer.getKmerLength());
+ KmerListWritable RFList, KmerListWritable RRList, VKmerBytesWritable kmer) {
+ this();
set(nodeIdList, FFList, FRList, RFList, RRList, kmer);
}
public void set(NodeWritable node){
- this.kmerlength = node.kmerlength;
set(node.nodeIdList, node.forwardForwardList, node.forwardReverseList, node.reverseForwardList,
node.reverseReverseList, node.kmer);
}
public void set(PositionListWritable nodeIdList, KmerListWritable FFList, KmerListWritable FRList,
- KmerListWritable RFList, KmerListWritable RRList, KmerBytesWritable kmer) {
+ KmerListWritable RFList, KmerListWritable RRList, VKmerBytesWritable kmer2) {
this.nodeIdList.set(nodeIdList);
- this.forwardForwardList.set(FFList);
- this.forwardReverseList.set(FRList);
- this.reverseForwardList.set(RFList);
- this.reverseReverseList.set(RRList);
- this.kmer.set(kmer);
+ this.forwardForwardList.setCopy(FFList);
+ this.forwardReverseList.setCopy(FRList);
+ this.reverseForwardList.setCopy(RFList);
+ this.reverseReverseList.setCopy(RRList);
+ this.kmer.setAsCopy(kmer2);
}
- public void reset(int kmerSize) {
- this.kmerlength = kmerSize;
+ public void reset() {
this.nodeIdList.reset();
- this.forwardForwardList.reset(kmerSize);
- this.forwardReverseList.reset(kmerSize);
- this.reverseForwardList.reset(kmerSize);
- this.reverseReverseList.reset(kmerSize);
+ this.forwardForwardList.reset();
+ this.forwardReverseList.reset();
+ this.reverseForwardList.reset();
+ this.reverseReverseList.reset();
this.kmer.reset(0);
}
@@ -85,24 +80,16 @@
this.nodeIdList.set(nodeIdList);
}
- public KmerBytesWritable getKmer() {
+ public VKmerBytesWritable getKmer() {
return kmer;
}
- public void setKmer(KmerBytesWritable kmer) {
- this.kmer.set(kmer);
+ public void setKmer(VKmerBytesWritable kmer) {
+ this.kmer.setAsCopy(kmer);
}
- public int getKmerlength() {
- return kmerlength;
- }
-
- public void setKmerlength(int kmerlength) {
- this.kmerlength = kmerlength;
- }
-
- public int getCount() {
- return kmer.getKmerLength();
+ public int getKmerLength() {
+ return kmer.getKmerLetterLength();
}
public KmerListWritable getFFList() {
@@ -122,19 +109,19 @@
}
public void setFFList(KmerListWritable forwardForwardList) {
- this.forwardForwardList.set(forwardForwardList);
+ this.forwardForwardList.setCopy(forwardForwardList);
}
public void setFRList(KmerListWritable forwardReverseList) {
- this.forwardReverseList.set(forwardReverseList);
+ this.forwardReverseList.setCopy(forwardReverseList);
}
public void setRFList(KmerListWritable reverseForwardList) {
- this.reverseForwardList.set(reverseForwardList);
+ this.reverseForwardList.setCopy(reverseForwardList);
}
public void setRRList(KmerListWritable reverseReverseList) {
- this.reverseReverseList.set(reverseReverseList);
+ this.reverseReverseList.setCopy(reverseReverseList);
}
public KmerListWritable getListFromDir(byte dir) {
@@ -152,9 +139,60 @@
}
}
+ /**
+ * Adds up getLength() of the node-id list, the four edge lists (FF, FR, RF, RR)
+ * and the kmer, i.e. the size of this node's byte-array form.
+ */
+ public int getSerializedLength() {
+ int total = nodeIdList.getLength();
+ total += forwardForwardList.getLength();
+ total += forwardReverseList.getLength();
+ total += reverseForwardList.getLength();
+ total += reverseReverseList.getLength();
+ total += kmer.getLength();
+ return total;
+ }
+
+ /**
+ * Serializes this node through write() into a freshly allocated byte array.
+ * The backing buffer is pre-sized with getSerializedLength().
+ */
+ public byte[] marshalToByteArray() throws IOException {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream(getSerializedLength());
+ write(new DataOutputStream(buffer));
+ return buffer.toByteArray();
+ }
+
+ /**
+ * Load this node from a serialized node found in `data` starting at `offset`.
+ * Fields are consumed in the order node-id list, FF, FR, RF, RR lists, then the kmer;
+ * after each list is loaded its getLength() is the number of bytes to skip to the next field.
+ * The list setters used here copy the bytes into this node's own storage.
+ * NOTE(review): presumably kmer.setAsCopy copies as well -- confirm in VKmerBytesWritable.
+ */
+ public void setAsCopy(byte[] data, int offset) {
+ int curOffset = offset;
+ nodeIdList.set(data, curOffset);
+
+ curOffset += nodeIdList.getLength();
+ forwardForwardList.setCopy(data, curOffset);
+ curOffset += forwardForwardList.getLength();
+ forwardReverseList.setCopy(data, curOffset);
+ curOffset += forwardReverseList.getLength();
+ reverseForwardList.setCopy(data, curOffset);
+ curOffset += reverseForwardList.getLength();
+ reverseReverseList.setCopy(data, curOffset);
+
+ curOffset += reverseReverseList.getLength();
+ kmer.setAsCopy(data, curOffset);
+ }
+
+ /**
+ * Point this node at a serialized node found in `data` starting at `offset`, without copying.
+ * Fields are located in the order node-id list, FF, FR, RF, RR lists, then the kmer;
+ * each list's getLength() gives the number of bytes to skip to the next field.
+ * The lists keep `data` as their backing array, so later changes to `data` are visible through them.
+ * NOTE(review): presumably kmer.setAsReference also aliases `data` -- confirm in VKmerBytesWritable.
+ */
+ public void setAsReference(byte[] data, int offset) {
+ int curOffset = offset;
+ nodeIdList.setNewReference(data, curOffset);
+
+ curOffset += nodeIdList.getLength();
+ forwardForwardList.setNewReference(data, curOffset);
+ curOffset += forwardForwardList.getLength();
+ forwardReverseList.setNewReference(data, curOffset);
+ curOffset += forwardReverseList.getLength();
+ reverseForwardList.setNewReference(data, curOffset);
+ curOffset += reverseForwardList.getLength();
+ reverseReverseList.setNewReference(data, curOffset);
+
+ curOffset += reverseReverseList.getLength();
+ kmer.setAsReference(data, curOffset);
+ }
+
@Override
public void write(DataOutput out) throws IOException {
- out.writeInt(kmerlength);
this.nodeIdList.write(out);
this.forwardForwardList.write(out);
this.forwardReverseList.write(out);
@@ -165,8 +203,7 @@
@Override
public void readFields(DataInput in) throws IOException {
- this.kmerlength = in.readInt();
- reset(kmerlength);
+ reset();
this.nodeIdList.readFields(in);
this.forwardForwardList.readFields(in);
this.forwardReverseList.readFields(in);
@@ -211,15 +248,15 @@
return sbuilder.toString();
}
- public void mergeForwardNext(NodeWritable nextNode, int initialKmerSize) {
- this.forwardForwardList.set(nextNode.forwardForwardList);
- this.forwardReverseList.set(nextNode.forwardReverseList);
+ public void mergeForwardNext(final NodeWritable nextNode, int initialKmerSize) {
+ this.forwardForwardList.setCopy(nextNode.forwardForwardList);
+ this.forwardReverseList.setCopy(nextNode.forwardReverseList);
kmer.mergeWithFFKmer(initialKmerSize, nextNode.getKmer());
}
- public void mergeForwardPre(NodeWritable preNode, int initialKmerSize) {
- this.reverseForwardList.set(preNode.reverseForwardList);
- this.reverseReverseList.set(preNode.reverseReverseList);
+ public void mergeForwardPre(final NodeWritable preNode, int initialKmerSize) {
+ this.reverseForwardList.setCopy(preNode.reverseForwardList);
+ this.reverseReverseList.setCopy(preNode.reverseReverseList);
kmer.mergeWithRRKmer(initialKmerSize, preNode.getKmer());
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionListWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionListWritable.java
index f135292..8de4b0e 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionListWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionListWritable.java
@@ -4,6 +4,7 @@
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@@ -12,129 +13,160 @@
import edu.uci.ics.genomix.data.Marshal;
import edu.uci.ics.genomix.type.PositionWritable;
-public class PositionListWritable implements Writable, Iterable<PositionWritable>, Serializable{
+public class PositionListWritable implements Writable, Iterable<PositionWritable>, Serializable {
private static final long serialVersionUID = 1L;
+ protected static final byte[] EMPTY_BYTES = {0,0,0,0};
+ protected static final int HEADER_SIZE = 4;
+
protected byte[] storage;
protected int offset;
protected int valueCount;
- protected static final byte[] EMPTY = {};
-
+ protected int maxStorageSize;
+
+
protected PositionWritable posIter = new PositionWritable();
-
+
public PositionListWritable() {
- this.storage = EMPTY;
- this.valueCount = 0;
- this.offset = 0;
+ storage = EMPTY_BYTES;
+ valueCount = 0;
+ offset = 0;
+ maxStorageSize = storage.length;
}
-
- public PositionListWritable(int count, byte[] data, int offset) {
- setNewReference(count, data, offset);
+
+ public PositionListWritable(byte[] data, int offset) {
+ setNewReference(data, offset);
}
-
+
public PositionListWritable(List<PositionWritable> posns) {
this();
- setSize(posns.size()); // reserve space for all elements
+ setSize(posns.size() * PositionWritable.LENGTH + HEADER_SIZE); // reserve space for all elements
for (PositionWritable p : posns) {
append(p);
}
}
-
- public void setNewReference(int count, byte[] data, int offset) {
- this.valueCount = count;
+
+ public void setNewReference(byte[] data, int offset) {
+ this.valueCount = Marshal.getInt(data, offset);
this.storage = data;
this.offset = offset;
+ maxStorageSize = valueCount * PositionWritable.LENGTH + HEADER_SIZE;
}
-
+
public void append(long uuid) {
- setSize((1 + valueCount) * PositionWritable.LENGTH);
- Marshal.putLong(uuid, storage, offset + valueCount * PositionWritable.LENGTH);
+ setSize((1 + valueCount) * PositionWritable.LENGTH + HEADER_SIZE);
+ Marshal.putLong(uuid, storage, offset + valueCount * PositionWritable.LENGTH + HEADER_SIZE);
valueCount += 1;
+ Marshal.putInt(valueCount, storage, offset);
}
-
- public void append(byte mateId, long readId, int posId){
+
+ public void append(byte mateId, long readId, int posId) {
append(PositionWritable.makeUUID(mateId, readId, posId));
}
-
+
public void append(PositionWritable pos) {
- if(pos != null)
+ if (pos != null)
append(pos.getUUID());
else
throw new RuntimeException("This position is null pointer!");
}
-
+
/*
* Append the otherList to the end of myList
*/
public void appendList(PositionListWritable otherList) {
if (otherList.valueCount > 0) {
- setSize((valueCount + otherList.valueCount) * PositionWritable.LENGTH);
+ setSize((valueCount + otherList.valueCount) * PositionWritable.LENGTH + HEADER_SIZE);
// copy contents of otherList into the end of my storage
- System.arraycopy(otherList.storage, otherList.offset,
- storage, offset + valueCount * PositionWritable.LENGTH,
- otherList.valueCount * PositionWritable.LENGTH);
+ System.arraycopy(otherList.storage, otherList.offset + HEADER_SIZE, storage, offset + valueCount
+ * PositionWritable.LENGTH + HEADER_SIZE, otherList.valueCount * PositionWritable.LENGTH);
valueCount += otherList.valueCount;
+ Marshal.putInt(valueCount, storage, offset);
}
}
-
+
+ /**
+ * Replace this list's contents with the union of this list and otherList,
+ * de-duplicating the elements through a temporary HashSet.
+ * NOTE(review): the set is filled with whatever the list iterators hand back. If they
+ * return the shared, storage-backed instance that getPosition() returns, the set entries
+ * alias storage that append() overwrites below -- confirm the iterators yield copies.
+ */
+ public void unionUpdate(PositionListWritable otherList) {
+ int newSize = valueCount + otherList.valueCount;
+ HashSet<PositionWritable> uniqueElements = new HashSet<PositionWritable>(newSize);
+ for (PositionWritable pos : this) {
+ uniqueElements.add(pos);
+ }
+ for (PositionWritable pos : otherList) {
+ uniqueElements.add(pos);
+ }
+ valueCount = 0;
+ setSize(newSize * PositionWritable.LENGTH + HEADER_SIZE);
+ for (PositionWritable pos : uniqueElements) {
+ append(pos);
+ }
+ }
+
public static int getCountByDataLength(int length) {
if (length % PositionWritable.LENGTH != 0) {
throw new IllegalArgumentException("Length of positionlist is invalid");
}
return length / PositionWritable.LENGTH;
}
-
+
public void set(PositionListWritable otherList) {
- set(otherList.valueCount, otherList.storage, otherList.offset);
+ set(otherList.storage, otherList.offset);
}
- public void set(int valueCount, byte[] newData, int offset) {
- this.valueCount = valueCount;
- setSize(valueCount * PositionWritable.LENGTH);
- if (valueCount > 0) {
- System.arraycopy(newData, offset, storage, this.offset, valueCount * PositionWritable.LENGTH);
+ public void set(byte[] newData, int newOffset) {
+ int newValueCount = Marshal.getInt(newData, newOffset);
+ setSize(newValueCount * PositionWritable.LENGTH + HEADER_SIZE);
+ if (newValueCount > 0) {
+ System.arraycopy(newData, newOffset + HEADER_SIZE, storage, this.offset + HEADER_SIZE, newValueCount * PositionWritable.LENGTH);
}
+ valueCount = newValueCount;
+ Marshal.putInt(valueCount, storage, this.offset);
}
public void reset() {
valueCount = 0;
+ Marshal.putInt(valueCount, storage, offset);
}
-
+
protected void setSize(int size) {
if (size > getCapacity()) {
setCapacity((size * 3 / 2));
}
}
-
+
protected int getCapacity() {
- return storage.length - offset;
+ return maxStorageSize - offset;
}
protected void setCapacity(int new_cap) {
if (new_cap > getCapacity()) {
byte[] new_data = new byte[new_cap];
- if (storage.length - offset > 0) {
- System.arraycopy(storage, offset, new_data, 0, storage.length - offset);
+ if (valueCount > 0) {
+ System.arraycopy(storage, offset, new_data, 0, valueCount * PositionWritable.LENGTH + HEADER_SIZE);
}
storage = new_data;
offset = 0;
+ maxStorageSize = storage.length;
}
}
-
+
public PositionWritable getPosition(int i) {
if (i >= valueCount) {
throw new ArrayIndexOutOfBoundsException("No such positions");
}
- posIter.setNewReference(storage, offset + i * PositionWritable.LENGTH);
+ posIter.setNewReference(storage, offset + i * PositionWritable.LENGTH + HEADER_SIZE);
return posIter;
}
-
+
public void resetPosition(int i, long uuid) {
if (i >= valueCount) {
throw new ArrayIndexOutOfBoundsException("No such positions");
}
- Marshal.putLong(uuid, storage, offset + i * PositionWritable.LENGTH);
+ Marshal.putLong(uuid, storage, offset + i * PositionWritable.LENGTH + HEADER_SIZE);
}
-
+
public int getCountOfPosition() {
return valueCount;
}
@@ -148,9 +180,9 @@
}
public int getLength() {
- return valueCount * PositionWritable.LENGTH;
+ return valueCount * PositionWritable.LENGTH + HEADER_SIZE;
}
-
+
@Override
public Iterator<PositionWritable> iterator() {
Iterator<PositionWritable> it = new Iterator<PositionWritable>() {
@@ -169,50 +201,54 @@
@Override
public void remove() {
- if(currentIndex < valueCount)
- System.arraycopy(storage, offset + currentIndex * PositionWritable.LENGTH,
- storage, offset + (currentIndex - 1) * PositionWritable.LENGTH,
- (valueCount - currentIndex) * PositionWritable.LENGTH);
+ if (currentIndex < valueCount)
+ System.arraycopy(storage, offset + currentIndex * PositionWritable.LENGTH + HEADER_SIZE, storage, offset
+ + (currentIndex - 1) * PositionWritable.LENGTH + HEADER_SIZE, (valueCount - currentIndex)
+ * PositionWritable.LENGTH);
valueCount--;
currentIndex--;
+ Marshal.putInt(valueCount, storage, offset);
}
};
return it;
}
-
+
/*
* remove the first instance of @toRemove. Uses a linear scan. Throws an exception if not in this list.
*/
public void remove(PositionWritable toRemove, boolean ignoreMissing) {
Iterator<PositionWritable> posIterator = this.iterator();
while (posIterator.hasNext()) {
- if(toRemove.equals(posIterator.next())) {
+ if (toRemove.equals(posIterator.next())) {
posIterator.remove();
- return;
+ return; // found it. return early.
}
}
+ // element not found.
if (!ignoreMissing) {
- throw new ArrayIndexOutOfBoundsException("the PositionWritable `" + toRemove.toString() + "` was not found in this list.");
+ throw new ArrayIndexOutOfBoundsException("the PositionWritable `" + toRemove.toString()
+ + "` was not found in this list.");
}
}
-
+
public void remove(PositionWritable toRemove) {
- remove(toRemove, false);
+ remove(toRemove, false);
}
-
+
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(valueCount);
- out.write(storage, offset, valueCount * PositionWritable.LENGTH);
+ out.write(storage, offset + HEADER_SIZE, valueCount * PositionWritable.LENGTH);
}
-
+
@Override
public void readFields(DataInput in) throws IOException {
this.valueCount = in.readInt();
- setSize(valueCount * PositionWritable.LENGTH);
- in.readFully(storage, offset, valueCount * PositionWritable.LENGTH);
+ setSize(valueCount * PositionWritable.LENGTH + HEADER_SIZE);
+ in.readFully(storage, offset + HEADER_SIZE, valueCount * PositionWritable.LENGTH);
+ Marshal.putInt(valueCount, storage, offset);
}
-
+
@Override
public String toString() {
StringBuilder sbuilder = new StringBuilder();
@@ -228,12 +264,12 @@
}
return sbuilder.toString();
}
-
+
@Override
public int hashCode() {
return Marshal.hashBytes(getByteArray(), getStartOffset(), getLength());
}
-
+
@Override
public boolean equals(Object o) {
if (!(o instanceof PositionListWritable))
@@ -241,9 +277,9 @@
PositionListWritable other = (PositionListWritable) o;
if (this.valueCount != other.valueCount)
return false;
- for (int i=0; i < this.valueCount; i++) {
- if (!this.getPosition(i).equals(other.getPosition(i)))
- return false;
+ for (int i = 0; i < this.valueCount; i++) {
+ if (!this.getPosition(i).equals(other.getPosition(i)))
+ return false;
}
return true;
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionWritable.java
index 1079677..03d66a6 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionWritable.java
@@ -119,10 +119,10 @@
}
/*
- * String of form "(readId-posID_mate)" where mate is _1 or _2
+ * String of form "(readId-posID_mate)" where mate is _0 or _1
*/
@Override
public String toString() {
- return "(" + this.getReadId() + "-" + this.getPosId() + "_" + (this.getMateId() + 1) + ")";
+ return "(" + this.getReadId() + "-" + this.getPosId() + "_" + (this.getMateId()) + ")";
}
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
new file mode 100644
index 0000000..df93069
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
@@ -0,0 +1,646 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.type;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+
+import edu.uci.ics.genomix.data.KmerUtil;
+import edu.uci.ics.genomix.data.Marshal;
+import edu.uci.ics.genomix.type.NodeWritable.DirectionFlag;
+
+
+/**
+ * Variable-length kmer which stores its length internally.
+ * Note: `kmerStartOffset` as used in this class is the offset at which the
+ * *kmer* begins. There is a {@value #HEADER_SIZE}-byte header preceding the kmer.
+ */
+public class VKmerBytesWritable extends BinaryComparable implements Serializable, WritableComparable<BinaryComparable> {
+ private static final long serialVersionUID = 1L;
+ protected static final byte[] EMPTY_BYTES = { 0, 0, 0, 0 }; // int indicating 0 length
+ protected static final int HEADER_SIZE = 4; // number of bytes for header info
+
+ protected int lettersInKmer;
+ protected int bytesUsed;
+ protected byte[] bytes;
+ protected int kmerStartOffset;
+ protected int storageMaxSize; // since we may be a reference inside a larger datablock, we must track our maximum size
+
+ /**
+ * Initialize as empty kmer
+ */
+ public VKmerBytesWritable() {
+ this(EMPTY_BYTES, 0);
+ }
+
+ /**
+ * Copy contents of kmer string
+ */
+ public VKmerBytesWritable(String kmer) {
+ bytes = new byte[HEADER_SIZE + KmerUtil.getByteNumFromK(kmer.length())];
+ kmerStartOffset = HEADER_SIZE;
+ storageMaxSize = bytes.length;
+ setAsCopy(kmer);
+ }
+
+ /**
+ * Set as reference to given data
+ *
+ * @param storage
+ * : byte array with header
+ * @param offset
+ */
+ public VKmerBytesWritable(byte[] storage, int offset) {
+ setAsReference(storage, offset);
+ }
+
+ /**
+ * Reserve space for k letters
+ */
+ public VKmerBytesWritable(int k) {
+ if (k > 0) {
+ bytes = new byte[HEADER_SIZE + KmerUtil.getByteNumFromK(k)];
+ } else if (k == 0) {
+ bytes = EMPTY_BYTES;
+ } else {
+ throw new IllegalArgumentException("Invalid K (" + k + ").");
+ }
+ kmerStartOffset = HEADER_SIZE;
+ storageMaxSize = bytes.length;
+ setKmerLength(k);
+ }
+
+ /**
+ * deep copy of kmer in other
+ *
+ * @param other
+ */
+ public VKmerBytesWritable(VKmerBytesWritable other) {
+ this(other.lettersInKmer);
+ setAsCopy(other);
+ }
+
+ /**
+ * Deep copy of the given kmer
+ *
+ * @param other
+ */
+ public void setAsCopy(VKmerBytesWritable other) {
+ reset(other.lettersInKmer);
+ if (lettersInKmer > 0) {
+ System.arraycopy(other.bytes, other.kmerStartOffset, bytes, this.kmerStartOffset, bytesUsed);
+ }
+ }
+
+ /**
+ * Set this kmer from the letters of the given String, packed at 2 bits per letter.
+ */
+ public void setAsCopy(String kmer) {
+ int k = kmer.length();
+ reset(k);
+ setByRead(kmer.getBytes(), 0); // pack the letters; a raw arraycopy would store ASCII bytes
+ }
+
+ /**
+ * Deep copy of the given bytes data
+ *
+ * @param newData
+ * : byte array to copy (should have a header)
+ * @param offset
+ */
+ public void setAsCopy(byte[] newData, int offset) {
+ int k = Marshal.getInt(newData, offset);
+ reset(k);
+ System.arraycopy(newData, offset + HEADER_SIZE, bytes, this.kmerStartOffset, bytesUsed);
+ }
+
+ /**
+ * Point this kmer at the given byte array (no copy is made). Throws
+ * IllegalArgumentException, leaving this kmer unchanged, if the buffer is too small.
+ *
+ * @param newData
+ * : byte array to reference (must hold a header at blockOffset)
+ * @param blockOffset offset of the header within newData
+ */
+ public void setAsReference(byte[] newData, int blockOffset) {
+ int kRequested = Marshal.getInt(newData, blockOffset);
+ int bytesRequested = KmerUtil.getByteNumFromK(kRequested) + HEADER_SIZE;
+ if (newData.length - blockOffset < bytesRequested) {
+ throw new IllegalArgumentException("Requested " + bytesRequested + " bytes (k=" + kRequested
+ + ") but buffer has only " + (newData.length - blockOffset) + " bytes");
+ }
+ bytes = newData; // only rebind once the buffer is known to be large enough
+ kmerStartOffset = blockOffset + HEADER_SIZE;
+ storageMaxSize = bytesRequested; // since we are a reference, store our max capacity
+ setKmerLength(kRequested);
+ }
+
+ /**
+ * Reset to hold a kmer of k letters, reallocating only when the current
+ * storage cannot hold it. Bytes of a reused buffer are not cleared.
+ * @param k new kmer length in letters
+ */
+ public void reset(int k) {
+ int newByteLength = KmerUtil.getByteNumFromK(k);
+ if (getKmerByteCapacity() < newByteLength) { // compare against capacity, not current usage
+ bytes = new byte[newByteLength + HEADER_SIZE];
+ kmerStartOffset = HEADER_SIZE;
+ storageMaxSize = bytes.length;
+ }
+ setKmerLength(k);
+ }
+
+ protected void clearLeadBit() {
+ if (lettersInKmer % 4 != 0) {
+ bytes[kmerStartOffset] &= (1 << ((lettersInKmer % 4) << 1)) - 1;
+ }
+ }
+
+ /**
+ * Get one genecode (A|G|C|T) from the given kmer index e.g. Get the 4th
+ * gene of the kmer ACGTA will return T
+ *
+ * @param pos
+ * @return
+ */
+ public byte getGeneCodeAtPosition(int pos) {
+ if (pos >= lettersInKmer || pos < 0) {
+ throw new ArrayIndexOutOfBoundsException("Gene position (" + pos + ") out of bounds for k=" + lettersInKmer);
+ }
+ return geneCodeAtPosition(pos);
+ }
+
+ /**
+ * unchecked version of getGeneCodeAtPosition. Used when kmerlength is
+ * inaccurate (mid-merge)
+ */
+ private byte geneCodeAtPosition(int pos) {
+ int posByte = pos / 4;
+ int shift = (pos % 4) << 1;
+ return (byte) ((bytes[kmerStartOffset + bytesUsed - 1 - posByte] >> shift) & 0x3);
+ }
+
+ /**
+ * Shift Kmer to accept new char input
+ *
+ * @param c
+ * Input new gene character
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextChar(byte c) {
+ return shiftKmerWithNextCode(GeneCode.getCodeFromSymbol(c));
+ }
+
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextCode(byte c) {
+ byte output = (byte) (bytes[kmerStartOffset + bytesUsed - 1] & 0x03);
+ for (int i = bytesUsed - 1; i > 0; i--) {
+ byte in = (byte) (bytes[kmerStartOffset + i - 1] & 0x03);
+ bytes[kmerStartOffset + i] = (byte) (((bytes[kmerStartOffset + i] >>> 2) & 0x3f) | (in << 6));
+ }
+ int pos = ((lettersInKmer - 1) % 4) << 1;
+ byte code = (byte) (c << pos);
+ bytes[kmerStartOffset] = (byte) (((bytes[kmerStartOffset] >>> 2) & 0x3f) | code);
+ clearLeadBit();
+ return output;
+ }
+
+ /**
+ * Shift Kmer to accept new input char
+ *
+ * @param c
+ * Input new gene character
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreChar(byte c) {
+ return shiftKmerWithPreCode(GeneCode.getCodeFromSymbol(c));
+ }
+
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreCode(byte c) {
+ int pos = ((lettersInKmer - 1) % 4) << 1;
+ byte output = (byte) ((bytes[kmerStartOffset] >> pos) & 0x03);
+ for (int i = 0; i < bytesUsed - 1; i++) {
+ byte in = (byte) ((bytes[kmerStartOffset + i + 1] >> 6) & 0x03);
+ bytes[kmerStartOffset + i] = (byte) ((bytes[kmerStartOffset + i] << 2) | in);
+ }
+ bytes[kmerStartOffset + bytesUsed - 1] = (byte) ((bytes[kmerStartOffset + bytesUsed - 1] << 2) | c);
+ clearLeadBit();
+ return output;
+ }
+
+ public int getKmerLetterLength() {
+ return lettersInKmer;
+ }
+
+ @Override
+ public byte[] getBytes() {
+ return bytes;
+ }
+
+ /**
+ * Return the (hyracks-specific) data block offset. This includes the header.
+ */
+ public int getBlockOffset() {
+ return kmerStartOffset - HEADER_SIZE;
+ }
+
+ /**
+ * Return the data block offset where the kmer data begins. This excludes the header.
+ */
+ public int getKmerOffset() {
+ return kmerStartOffset;
+ }
+
+ /**
+ * Return the number of bytes used by both header and kmer chain
+ */
+ @Override
+ public int getLength() {
+ return bytesUsed + HEADER_SIZE;
+ }
+
+ /**
+ * Return the number of bytes used by the kmer chain
+ */
+ public int getKmerByteLength() {
+ return bytesUsed;
+ }
+
+
+ public void setKmerLength(int k) {
+ this.bytesUsed = KmerUtil.getByteNumFromK(k);
+ this.lettersInKmer = k;
+ saveHeader(k);
+ }
+
+ protected int getKmerByteCapacity() {
+ return storageMaxSize - HEADER_SIZE;
+ }
+
+ protected void setKmerByteCapacity(int new_cap) {
+ if (new_cap != getKmerByteCapacity()) {
+ byte[] new_data = new byte[new_cap + HEADER_SIZE];
+ if (new_cap < bytesUsed) {
+ bytesUsed = new_cap;
+ }
+ if (bytesUsed != 0) {
+ System.arraycopy(bytes, kmerStartOffset, new_data, HEADER_SIZE, bytesUsed);
+ }
+ bytes = new_data;
+ kmerStartOffset = HEADER_SIZE;
+ storageMaxSize = bytes.length;
+ }
+ }
+
+ private void saveHeader(int length) {
+ Marshal.putInt(length, bytes, kmerStartOffset - HEADER_SIZE);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ lettersInKmer = in.readInt();
+ bytesUsed = KmerUtil.getByteNumFromK(lettersInKmer);
+ if (getKmerByteCapacity() < this.bytesUsed) {
+ this.bytes = new byte[this.bytesUsed + HEADER_SIZE];
+ this.kmerStartOffset = HEADER_SIZE;
+ storageMaxSize = bytes.length;
+ }
+ // keep the in-buffer header in sync: write(), hashCode() and equals() all read it
+ saveHeader(lettersInKmer);
+ in.readFully(bytes, kmerStartOffset, bytesUsed); // reads nothing when bytesUsed == 0
+ }
+
+ /**
+ * write the entire byte array including the header
+ */
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.write(bytes, kmerStartOffset - HEADER_SIZE, bytesUsed + HEADER_SIZE);
+ }
+
+ @Override
+ public int hashCode() {
+ return Marshal.hashBytes(bytes, kmerStartOffset - HEADER_SIZE, bytesUsed + HEADER_SIZE);
+ }
+
+ @Override
+ public boolean equals(Object right_obj) {
+ if (right_obj instanceof VKmerBytesWritable) {
+ // since these may be backed by storage of different sizes, we have to manually check each byte, including the header
+ VKmerBytesWritable right = (VKmerBytesWritable) right_obj;
+ for (int i = -HEADER_SIZE; i < bytesUsed; i++) {
+ if (bytes[kmerStartOffset + i] != right.bytes[right.kmerStartOffset + i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ return KmerUtil.recoverKmerFrom(this.lettersInKmer, bytes, kmerStartOffset, bytesUsed);
+ }
+
+ public static class Comparator extends WritableComparator {
+
+ public Comparator() {
+ super(VKmerBytesWritable.class);
+ }
+
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ int kmerlength1 = Marshal.getInt(b1, s1);
+ int kmerlength2 = Marshal.getInt(b2, s2);
+ if (kmerlength1 == kmerlength2) {
+ return compareBytes(b1, s1 + HEADER_SIZE, l1 - HEADER_SIZE, b2, s2 + HEADER_SIZE, l2 - HEADER_SIZE);
+ }
+ return kmerlength1 - kmerlength2;
+ }
+ }
+
+ static { // register this comparator
+ WritableComparator.define(VKmerBytesWritable.class, new Comparator());
+ }
+
+ /**
+ * Ensures that there is space for at least `size` bytes of kmer (not
+ * including any header)
+ */
+ protected void setSize(int size) {
+ if (size > getKmerByteCapacity()) {
+ setKmerByteCapacity((size * 3 / 2));
+ }
+ this.bytesUsed = size;
+ }
+
+ public void setByRead(int k, byte[] stringBytes, int start) {
+ reset(k);
+ setByRead(stringBytes, start);
+ }
+
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param stringBytes
+ * @param start
+ */
+ public void setByRead(byte[] stringBytes, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = this.bytesUsed - 1;
+ for (int i = start; i < start + lettersInKmer && i < stringBytes.length; i++) {
+ byte code = GeneCode.getCodeFromSymbol(stringBytes[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[kmerStartOffset + bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[kmerStartOffset] = l;
+ }
+ }
+
+ public void setByReadReverse(int k, byte[] stringBytes, int start) {
+ reset(k);
+ setByReadReverse(stringBytes, start);
+ }
+
+ /**
+ * Compress the reverse complement of a read into the bytes array, e.g. AATAG
+ * is paired to CTATT, and then compressed as [0x000T,0xTATC]
+ *
+ * @param array
+ * text of the read, one symbol per byte
+ * @param start
+ * index in array of the first letter of the kmer
+ */
+ public void setByReadReverse(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = bytesUsed - 1;
+ // walk the read backwards from its last letter down to `start` (not 0), so
+ // that at most lettersInKmer symbols are consumed
+ for (int i = start + lettersInKmer - 1; i >= start && i < array.length; i--) {
+ byte code = GeneCode.getPairedCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2; // two bits per letter
+ if (bytecount == 8) {
+ bytes[kmerStartOffset + bcount--] = l; // byte is full: store it and move towards the lead byte
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[kmerStartOffset] = l; // remaining (partial) letters go into the lead byte
+ }
+ }
+
+ /**
+ * Merge Kmer with the next connected Kmer e.g. AAGCTAA merge with AACAACC,
+ * if the initial kmerSize = 3 then it will return AAGCTAACAACC
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param kmer
+ * : the next kmer
+ */
+ public void mergeWithFFKmer(int initialKmerSize, VKmerBytesWritable kmer) {
+ int preKmerLength = lettersInKmer;
+ int preSize = bytesUsed;
+ lettersInKmer += kmer.lettersInKmer - initialKmerSize + 1;
+ setSize(KmerUtil.getByteNumFromK(lettersInKmer));
+ for (int i = 1; i <= preSize; i++) {
+ bytes[kmerStartOffset + bytesUsed - i] = bytes[kmerStartOffset + preSize - i];
+ }
+ for (int k = initialKmerSize - 1; k < kmer.getKmerLetterLength(); k += 4) {
+ byte onebyte = KmerBytesWritable.getOneByteFromKmerAtPosition(k, kmer.bytes, kmer.kmerStartOffset,
+ kmer.bytesUsed);
+ KmerBytesWritable.appendOneByteAtPosition(preKmerLength + k - initialKmerSize + 1, onebyte, bytes,
+ kmerStartOffset, bytesUsed);
+ }
+ clearLeadBit();
+ saveHeader(lettersInKmer);
+ }
+
+ public void mergeWithFFKmer(int kmerSize, KmerBytesWritable kmer) {
+ // TODO make this more efficient
+ mergeWithFFKmer(kmerSize, new VKmerBytesWritable(kmer.toString()));
+ }
+
+ /**
+ * Merge Kmer with the next connected Kmer, when that Kmer needs to be
+ * reverse-complemented e.g. AAGCTAA merge with GGTTGTT, if the initial
+ * kmerSize = 3 then it will return AAGCTAACAACC A merge B => A B~
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param kmer
+ * : the next kmer
+ */
+ public void mergeWithFRKmer(int initialKmerSize, VKmerBytesWritable kmer) {
+ int preSize = bytesUsed;
+ int preKmerLength = lettersInKmer;
+ lettersInKmer += kmer.lettersInKmer - initialKmerSize + 1;
+ setSize(KmerUtil.getByteNumFromK(lettersInKmer));
+ // copy prefix into right-side of buffer
+ for (int i = 1; i <= preSize; i++) {
+ bytes[kmerStartOffset + bytesUsed - i] = bytes[kmerStartOffset + preSize - i];
+ }
+
+ int bytecount = (preKmerLength % 4) * 2;
+ int bcount = bytesUsed - preSize - bytecount / 8; // may overlap
+ // previous kmer
+ byte l = bcount == bytesUsed - preSize ? bytes[kmerStartOffset + bcount] : 0x00;
+ bytecount %= 8;
+ for (int i = kmer.lettersInKmer - initialKmerSize; i >= 0; i--) {
+ byte code = GeneCode.getPairedGeneCode(kmer.getGeneCodeAtPosition(i));
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[kmerStartOffset + bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[kmerStartOffset] = l;
+ }
+ saveHeader(lettersInKmer);
+ }
+
+ public void mergeWithFRKmer(int kmerSize, KmerBytesWritable kmer) {
+ // TODO make this more efficient
+ mergeWithFRKmer(kmerSize, new VKmerBytesWritable(kmer.toString()));
+ }
+
+ /**
+ * Merge Kmer with the previous connected Kmer, when that kmer needs to be
+ * reverse-complemented e.g. AACAACC merge with TTCTGCC, if the initial
+ * kmerSize = 3 then it will return GGCAGAACAACC
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param preKmer
+ * : the previous kmer
+ */
+ public void mergeWithRFKmer(int initialKmerSize, VKmerBytesWritable preKmer) {
+ // TODO make this more efficient
+ VKmerBytesWritable reversed = new VKmerBytesWritable(preKmer.lettersInKmer);
+ reversed.setByReadReverse(preKmer.toString().getBytes(), 0);
+ mergeWithRRKmer(initialKmerSize, reversed);
+ }
+
+ public void mergeWithRFKmer(int kmerSize, KmerBytesWritable kmer) {
+ // TODO make this more efficient
+ mergeWithRFKmer(kmerSize, new VKmerBytesWritable(kmer.toString()));
+ }
+
+ /**
+ * Merge Kmer with the previous connected Kmer e.g. AACAACC merge with
+ * AAGCTAA, if the initial kmerSize = 3 then it will return AAGCTAACAACC
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param preKmer
+ * : the previous kmer
+ */
+ public void mergeWithRRKmer(int initialKmerSize, VKmerBytesWritable preKmer) {
+ int preKmerLength = lettersInKmer;
+ int preSize = bytesUsed;
+ lettersInKmer += preKmer.lettersInKmer - initialKmerSize + 1;
+ setSize(KmerUtil.getByteNumFromK(lettersInKmer));
+ byte cacheByte = KmerBytesWritable.getOneByteFromKmerAtPosition(0, bytes, kmerStartOffset, preSize);
+ // copy prekmer
+ for (int k = 0; k < preKmer.lettersInKmer - initialKmerSize + 1; k += 4) {
+ byte onebyte = KmerBytesWritable.getOneByteFromKmerAtPosition(k, preKmer.bytes, preKmer.kmerStartOffset,
+ preKmer.bytesUsed);
+ KmerBytesWritable.appendOneByteAtPosition(k, onebyte, bytes, kmerStartOffset, bytesUsed);
+ }
+
+ // copy current kmer
+ int k = 4;
+ for (; k < preKmerLength; k += 4) {
+ byte onebyte = KmerBytesWritable.getOneByteFromKmerAtPosition(k, bytes, kmerStartOffset, preSize);
+ KmerBytesWritable.appendOneByteAtPosition(preKmer.lettersInKmer - initialKmerSize + k - 4 + 1, cacheByte,
+ bytes, kmerStartOffset, bytesUsed);
+ cacheByte = onebyte;
+ }
+ KmerBytesWritable.appendOneByteAtPosition(preKmer.lettersInKmer - initialKmerSize + k - 4 + 1, cacheByte,
+ bytes, kmerStartOffset, bytesUsed);
+ clearLeadBit();
+ saveHeader(lettersInKmer); // header must reflect the merged length, as in the FF/FR merges
+ }
+
+ public void mergeWithRRKmer(int kmerSize, KmerBytesWritable kmer) {
+ // TODO make this more efficient
+ mergeWithRRKmer(kmerSize, new VKmerBytesWritable(kmer.toString()));
+ }
+
+ public void mergeWithKmerInDir(byte dir, int initialKmerSize, VKmerBytesWritable kmer) {
+ switch (dir & DirectionFlag.DIR_MASK) {
+ case DirectionFlag.DIR_FF:
+ mergeWithFFKmer(initialKmerSize, kmer);
+ break;
+ case DirectionFlag.DIR_FR:
+ mergeWithFRKmer(initialKmerSize, kmer);
+ break;
+ case DirectionFlag.DIR_RF:
+ mergeWithRFKmer(initialKmerSize, kmer);
+ break;
+ case DirectionFlag.DIR_RR:
+ mergeWithRRKmer(initialKmerSize, kmer);
+ break;
+ default:
+ throw new RuntimeException("Direction not recognized: " + dir);
+ }
+ }
+ public void mergeWithKmerInDir(byte dir, int initialKmerSize, KmerBytesWritable kmer) {
+ // TODO make this more efficient
+ mergeWithKmerInDir(dir, initialKmerSize, new VKmerBytesWritable(kmer.toString()));
+ }
+
+ public KmerBytesWritable asFixedLengthKmer() {
+ if (lettersInKmer != KmerBytesWritable.getKmerLength()) {
+ throw new IllegalArgumentException("VKmer " + this.toString() + " is not of the same length as the fixed length Kmer (" + KmerBytesWritable.getKmerLength() + " )!");
+ }
+ return new KmerBytesWritable(bytes, kmerStartOffset);
+ }
+
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/GeneCode.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/GeneCode.java
new file mode 100644
index 0000000..c3d8a98
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/GeneCode.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.velvet.oldtype;
+
+public class GeneCode {
+ public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
+ /**
+ * Make sure these 4 ids are equal to the index of the corresponding char in
+ * {@link #GENE_SYMBOL}.
+ */
+ public static final byte A = 0;
+ public static final byte C = 1;
+ public static final byte G = 2;
+ public static final byte T = 3;
+ /** Map an upper- or lower-case symbol to its gene code; any other byte yields A (0). */
+ public static byte getCodeFromSymbol(byte ch) {
+ byte r = 0; // default for unrecognized symbols: same value as A
+ switch (ch) {
+ case 'A':
+ case 'a':
+ r = A;
+ break;
+ case 'C':
+ case 'c':
+ r = C;
+ break;
+ case 'G':
+ case 'g':
+ r = G;
+ break;
+ case 'T':
+ case 't':
+ r = T;
+ break;
+ }
+ return r;
+ }
+ /** Return the paired code (A<->T, C<->G); throws IllegalArgumentException outside 0..3. */
+ public static byte getPairedGeneCode(byte genecode){
+ if ( genecode < 0 || genecode > 3){
+ throw new IllegalArgumentException("Invalid genecode");
+ }
+ return (byte) (3- genecode); // 0<->3 and 1<->2
+ }
+ /** Return the paired gene code of the given symbol. */
+ public static byte getPairedCodeFromSymbol(byte ch){
+ return getPairedGeneCode(getCodeFromSymbol(ch));
+ }
+ /** Return the upper-case symbol for a gene code; throws IllegalArgumentException outside 0..3. */
+ public static byte getSymbolFromCode(byte code) {
+ if (code > 3 || code < 0 ) {
+ throw new IllegalArgumentException("Invalid genecode");
+ }
+ return GENE_SYMBOL[code];
+ }
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/KmerBytesWritable.java
new file mode 100644
index 0000000..630dbad
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/KmerBytesWritable.java
@@ -0,0 +1,502 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.velvet.oldtype;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+
+import edu.uci.ics.genomix.data.KmerUtil;
+
+/**
+ * Variable kmer length byteswritable
+ * It was used to generate the graph in which phase the kmer length doesn't change.
+ * Thus the size of bytes doesn't change either.
+ */
+public class KmerBytesWritable extends BinaryComparable implements Serializable, WritableComparable<BinaryComparable> {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ private static final byte[] EMPTY_BYTES = {};
+
+ protected int size;
+ protected byte[] bytes;
+ protected int offset;
+ protected int kmerlength;
+
+ public KmerBytesWritable() {
+ this(0, EMPTY_BYTES, 0);
+ }
+
+ public KmerBytesWritable(int k, byte[] storage, int offset) {
+ setNewReference(k, storage, offset);
+ }
+
+ public KmerBytesWritable(int k, String kmer) {
+ setNewReference(kmer.length(), kmer.getBytes(), 0);
+ }
+
+ /**
+ * Initial Kmer space by kmerlength
+ *
+ * @param k
+ * kmerlength
+ */
+ public KmerBytesWritable(int k) {
+ this.kmerlength = k;
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ if (k > 0) {
+ this.bytes = new byte[this.size];
+ } else {
+ this.bytes = EMPTY_BYTES;
+ }
+ this.offset = 0;
+ }
+
+ public KmerBytesWritable(KmerBytesWritable right) {
+ this(right.kmerlength);
+ set(right);
+ }
+
+ /**
+ * Deep copy of the given kmer
+ *
+ * @param newData
+ */
+ public void set(KmerBytesWritable newData) {
+ if (newData == null) {
+ this.set(0, EMPTY_BYTES, 0);
+ } else {
+ this.set(newData.kmerlength, newData.bytes, newData.getOffset());
+ }
+ }
+
+ /**
+ * Deep copy of the given bytes data
+ * It will not change the kmerlength
+ *
+ * @param newData
+ * @param offset
+ */
+ public void set(byte[] newData, int offset) {
+ if (kmerlength > 0) {
+ System.arraycopy(newData, offset, bytes, this.offset, size);
+ }
+ }
+
+ /**
+ * Deep copy of the given data, and also set to new kmerlength
+ *
+ * @param k
+ * : new kmer length
+ * @param newData
+ * : data storage
+ * @param offset
+ * : start offset
+ */
+ public void set(int k, byte[] newData, int offset) {
+ reset(k);
+ if (k > 0) {
+ System.arraycopy(newData, offset, bytes, this.offset, size);
+ }
+ }
+
+ /**
+ * Reset array by kmerlength
+ *
+ * @param k
+ */
+ public void reset(int k) {
+ this.kmerlength = k;
+ setSize(KmerUtil.getByteNumFromK(k));
+ clearLeadBit();
+ }
+
+ /**
+ * Point this datablock to the given bytes array
+ * It works like the pointer to new datablock.
+ * kmerlength will not change
+ *
+ * @param newData
+ * @param offset
+ */
+ public void setNewReference(byte[] newData, int offset) {
+ this.bytes = newData;
+ this.offset = offset;
+ if (newData.length - offset < size) {
+ throw new IllegalArgumentException("Not given enough space");
+ }
+ }
+
+ /**
+ * Point this datablock to the given bytes array
+ * It works like the pointer to new datablock.
+ * It also set the new kmerlength
+ *
+ * @param k
+ * @param newData
+ * @param offset
+ */
+ public void setNewReference(int k, byte[] newData, int offset) {
+ this.kmerlength = k;
+ this.size = KmerUtil.getByteNumFromK(k);
+ setNewReference(newData, offset);
+ }
+
+ protected void setSize(int size) {
+ if (size > getCapacity()) {
+ setCapacity((size * 3 / 2));
+ }
+ this.size = size;
+ }
+
+ protected int getCapacity() {
+ return bytes.length - offset; // bytes before offset are not usable by this kmer
+ }
+
+ protected void setCapacity(int new_cap) {
+ if (new_cap != getCapacity()) {
+ byte[] new_data = new byte[new_cap];
+ if (new_cap < size) {
+ size = new_cap;
+ }
+ if (size != 0) {
+ System.arraycopy(bytes, offset, new_data, 0, size);
+ }
+ bytes = new_data;
+ offset = 0;
+ }
+ }
+
+ /**
+ * Get one genecode (A|G|C|T) from the given kmer index
+ * e.g. Get the 4th gene of the kmer ACGTA will return T
+ *
+ * @param pos zero-based letter index; IllegalArgumentException if outside [0, kmerlength)
+ * @return the 2-bit gene code stored at that position
+ */
+ public byte getGeneCodeAtPosition(int pos) {
+ if (pos >= kmerlength || pos < 0) {
+ throw new IllegalArgumentException("gene position out of bound");
+ }
+ int posByte = pos / 4;
+ int shift = (pos % 4) << 1;
+ return (byte) ((bytes[offset + size - 1 - posByte] >> shift) & 0x3);
+ }
+
+ public int getKmerLength() {
+ return this.kmerlength;
+ }
+
+ @Override
+ public byte[] getBytes() {
+ return bytes;
+ }
+
+ public int getOffset() {
+ return offset;
+ }
+
+ @Override
+ public int getLength() {
+ return size;
+ }
+
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public void setByRead(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = this.size - 1;
+ for (int i = start; i < start + kmerlength && i < array.length; i++) {
+ byte code = GeneCode.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[offset + bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[offset] = l;
+ }
+ }
+
+ public void setByRead(int k, byte[] array, int start) {
+ reset(k);
+ setByRead(array, start);
+ }
+
+ /**
+ * Compress Reversed read into bytes array
+ * e.g. AATAG will paired to CTATT, and then compress as
+ * [0x000T,0xTATC]
+ *
+ * @param array
+ * text of the read, one symbol per byte
+ * @param start
+ * index in array of the first letter of the kmer
+ */
+ public void setByReadReverse(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = size - 1;
+ for (int i = start + kmerlength - 1; i >= start && i < array.length; i--) { // stop at start, not 0
+ byte code = GeneCode.getPairedCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[offset + bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[offset] = l;
+ }
+ }
+
+ public void setByReadReverse(int k, byte[] array, int start) {
+ reset(k);
+ setByReadReverse(array, start);
+ }
+
+ /**
+ * Shift Kmer to accept new char input
+ *
+ * @param c
+ * Input new gene character
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextChar(byte c) {
+ return shiftKmerWithNextCode(GeneCode.getCodeFromSymbol(c));
+ }
+
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextCode(byte c) {
+ byte output = (byte) (bytes[offset + size - 1] & 0x03);
+ for (int i = size - 1; i > 0; i--) {
+ byte in = (byte) (bytes[offset + i - 1] & 0x03);
+ bytes[offset + i] = (byte) (((bytes[offset + i] >>> 2) & 0x3f) | (in << 6));
+ }
+ int pos = ((kmerlength - 1) % 4) << 1;
+ byte code = (byte) (c << pos);
+ bytes[offset] = (byte) (((bytes[offset] >>> 2) & 0x3f) | code);
+ clearLeadBit();
+ return output;
+ }
+
+ /**
+ * Shift Kmer to accept new input char
+ *
+ * @param c
+ * Input new gene character
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreChar(byte c) {
+ return shiftKmerWithPreCode(GeneCode.getCodeFromSymbol(c));
+ }
+
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreCode(byte c) {
+ int pos = ((kmerlength - 1) % 4) << 1;
+ byte output = (byte) ((bytes[offset] >> pos) & 0x03);
+ for (int i = 0; i < size - 1; i++) {
+ byte in = (byte) ((bytes[offset + i + 1] >> 6) & 0x03);
+ bytes[offset + i] = (byte) ((bytes[offset + i] << 2) | in);
+ }
+ bytes[offset + size - 1] = (byte) ((bytes[offset + size - 1] << 2) | c);
+ clearLeadBit();
+ return output;
+ }
+
+ /**
+ * Merge Kmer with the next connected Kmer
+ * e.g. AAGCTAA merge with AACAACC, if the initial kmerSize = 3
+ * then it will return AAGCTAACAACC
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param kmer
+ * : the next kmer
+ */
+ public void mergeNextKmer(int initialKmerSize, KmerBytesWritable kmer) {
+ int preKmerLength = kmerlength;
+ int preSize = size;
+ this.kmerlength += kmer.kmerlength - initialKmerSize + 1;
+ setSize(KmerUtil.getByteNumFromK(kmerlength));
+ for (int i = 1; i <= preSize; i++) {
+ bytes[offset + size - i] = bytes[offset + preSize - i];
+ }
+ for (int k = initialKmerSize - 1; k < kmer.getKmerLength(); k += 4) {
+ byte onebyte = getOneByteFromKmerAtPosition(k, kmer.getBytes(), kmer.getOffset(), kmer.getLength());
+ appendOneByteAtPosition(preKmerLength + k - initialKmerSize + 1, onebyte, bytes, offset, size);
+ }
+ clearLeadBit();
+ }
+
+ /**
+ * Merge Kmer with the previous connected Kmer
+ * e.g. AACAACC merge with AAGCTAA, if the initial kmerSize = 3
+ * then it will return AAGCTAACAACC
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param preKmer
+ * : the previous kmer
+ */
+ public void mergePreKmer(int initialKmerSize, KmerBytesWritable preKmer) {
+ int preKmerLength = kmerlength;
+ int preSize = size;
+ this.kmerlength += preKmer.kmerlength - initialKmerSize + 1;
+ setSize(KmerUtil.getByteNumFromK(kmerlength));
+ byte cacheByte = getOneByteFromKmerAtPosition(0, bytes, offset, preSize);
+
+ // copy prekmer
+ for (int k = 0; k < preKmer.kmerlength - initialKmerSize + 1; k += 4) {
+ byte onebyte = getOneByteFromKmerAtPosition(k, preKmer.bytes, preKmer.offset, preKmer.size);
+ appendOneByteAtPosition(k, onebyte, bytes, offset, size);
+ }
+
+ // copy current kmer
+ int k = 4;
+ for (; k < preKmerLength; k += 4) {
+ byte onebyte = getOneByteFromKmerAtPosition(k, bytes, offset, preSize);
+ appendOneByteAtPosition(preKmer.kmerlength - initialKmerSize + k - 4 + 1, cacheByte, bytes, offset, size);
+ cacheByte = onebyte;
+ }
+ appendOneByteAtPosition(preKmer.kmerlength - initialKmerSize + k - 4 + 1, cacheByte, bytes, offset, size);
+ clearLeadBit();
+ }
+
+ public static void appendOneByteAtPosition(int k, byte onebyte, byte[] buffer, int start, int length) {
+ int position = start + length - 1 - k / 4;
+ if (position < start) {
+ throw new IllegalArgumentException("Buffer for kmer storage is invalid");
+ }
+ int shift = ((k) % 4) << 1;
+ int mask = shift == 0 ? 0 : ((1 << shift) - 1);
+
+ buffer[position] = (byte) ((buffer[position] & mask) | ((0xff & onebyte) << shift));
+ if (position > start && shift != 0) {
+ buffer[position - 1] = (byte) ((buffer[position - 1] & (0xff - mask)) | ((byte) ((0xff & onebyte) >> (8 - shift))));
+ }
+ }
+
+ public static byte getOneByteFromKmerAtPosition(int k, byte[] buffer, int start, int length) {
+ int position = start + length - 1 - k / 4;
+ if (position < start) {
+ throw new IllegalArgumentException("Buffer of kmer storage is invalid");
+ }
+ int shift = (k % 4) << 1;
+ byte data = (byte) (((0xff) & buffer[position]) >> shift);
+ if (shift != 0 && position > start) {
+ data |= 0xff & (buffer[position - 1] << (8 - shift));
+ }
+ return data;
+ }
+
+ protected void clearLeadBit() {
+ if (kmerlength % 4 != 0) {
+ bytes[offset] &= (1 << ((kmerlength % 4) << 1)) - 1;
+ }
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ this.kmerlength = in.readInt(); // layout matches write(): kmer length first, then the packed bytes
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ if (this.kmerlength > 0) {
+ if (this.bytes.length - this.offset < this.size) { // usable room starts at offset, not at index 0
+ this.bytes = new byte[this.size];
+ this.offset = 0;
+ }
+ in.readFully(bytes, offset, size);
+ }
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(kmerlength);
+ if (kmerlength > 0) {
+ out.write(bytes, offset, size);
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return super.hashCode() * 31 + this.kmerlength;
+ }
+
+ @Override
+ public boolean equals(Object right_obj) {
+ if (right_obj instanceof KmerBytesWritable)
+ return this.kmerlength == ((KmerBytesWritable) right_obj).kmerlength && super.equals(right_obj);
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ return KmerUtil.recoverKmerFrom(this.kmerlength, this.getBytes(), offset, this.getLength());
+ }
+
+ public static class Comparator extends WritableComparator {
+ public final int LEAD_BYTES = 4;
+
+ public Comparator() {
+ super(KmerBytesWritable.class);
+ }
+
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ int kmerlength1 = readInt(b1, s1);
+ int kmerlength2 = readInt(b2, s2);
+ if (kmerlength1 == kmerlength2) {
+ return compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES);
+ }
+ return kmerlength1 - kmerlength2;
+ }
+ }
+
+ static { // register this comparator
+ WritableComparator.define(KmerBytesWritable.class, new Comparator());
+ }
+
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/KmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/KmerBytesWritableFactory.java
new file mode 100644
index 0000000..b0aaebc
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/KmerBytesWritableFactory.java
@@ -0,0 +1,313 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.velvet.oldtype;
+
+public class KmerBytesWritableFactory {
+ private KmerBytesWritable kmer;
+
+ public KmerBytesWritableFactory(int k) {
+ kmer = new KmerBytesWritable(k);
+ }
+
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public KmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
+ kmer.reset(k);
+ kmer.setByRead(array, start);
+ return kmer;
+ }
+
+ /**
+ * Resets the shared kmer to length k and fills it via setByReadReverse;
+ * the returned object is the factory's own instance, reused by later calls.
+ * @param k length of the kmer to build
+ * @param array read text handed to setByReadReverse
+ * @param start start index handed to setByReadReverse
+ */
+ public KmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
+ kmer.reset(k);
+ kmer.setByReadReverse(array, start);
+ return kmer;
+ }
+
+ /**
+ * Get last kmer from kmer-chain.
+ * e.g. kmerChain is AAGCTA, if k =5, it will
+ * return AGCTA
+ *
+ * @param k
+ * @param kInChain
+ * @param kmerChain
+ * @return LastKmer bytes array
+ */
+ public KmerBytesWritable getLastKmerFromChain(int lastK, final KmerBytesWritable kmerChain) {
+ if (lastK > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (lastK == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(lastK);
+
+ /** from end to start */
+ int byteInChain = kmerChain.getLength() - 1 - (kmerChain.getKmerLength() - lastK) / 4;
+ int posInByteOfChain = ((kmerChain.getKmerLength() - lastK) % 4) << 1; // *2
+ int byteInKmer = kmer.getLength() - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+ kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
+ kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ }
+
+ /** last kmer byte */
+ if (byteInKmer == 0) {
+ kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ }
+ kmer.clearLeadBit();
+ return kmer;
+ }
+
+ /**
+ * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if k=5, it will
+ * return AAGCT
+ *
+ * @param k
+ * @param kInChain
+ * @param kmerChain
+ * @return FirstKmer bytes array
+ */
+ public KmerBytesWritable getFirstKmerFromChain(int firstK, final KmerBytesWritable kmerChain) {
+ if (firstK > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (firstK == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(firstK);
+
+ int i = 1;
+ for (; i < kmer.getLength(); i++) {
+ kmer.getBytes()[kmer.getLength() - i] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ }
+ int posInByteOfChain = (firstK % 4) << 1; // *2
+ if (posInByteOfChain == 0) {
+ kmer.getBytes()[0] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ } else {
+ kmer.getBytes()[0] = (byte) (kmerChain.getBytes()[kmerChain.getLength() - i] & ((1 << posInByteOfChain) - 1));
+ }
+ kmer.clearLeadBit();
+ return kmer;
+ }
+
+ public KmerBytesWritable getSubKmerFromChain(int startK, int kSize, final KmerBytesWritable kmerChain) {
+ if (startK + kSize > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (startK == 0 && kSize == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(kSize);
+
+ /** from end to start */
+ int byteInChain = kmerChain.getLength() - 1 - startK / 4;
+ int posInByteOfChain = startK % 4 << 1; // *2
+ int byteInKmer = kmer.getLength() - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+ kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
+ kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ }
+
+ /** last kmer byte */
+ if (byteInKmer == 0) {
+ kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ }
+ kmer.clearLeadBit();
+ return kmer;
+ }
+
+ /**
+ * Merge kmer with next neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAGCTA
+ *
+ * @param k
+ * :input k of kmer
+ * @param kmer
+ * : input bytes of kmer
+ * @param nextCode
+ * : next neighbor in gene-code format
+ * @return the merged Kmer, this K of this Kmer is k+1
+ */
+ public KmerBytesWritable mergeKmerWithNextCode(final KmerBytesWritable kmer, byte nextCode) {
+ this.kmer.reset(kmer.getKmerLength() + 1);
+ for (int i = 1; i <= kmer.getLength(); i++) {
+ this.kmer.getBytes()[this.kmer.getLength() - i] = kmer.getBytes()[kmer.getLength() - i];
+ }
+ if (this.kmer.getLength() > kmer.getLength()) {
+ this.kmer.getBytes()[0] = (byte) (nextCode & 0x3);
+ } else {
+ this.kmer.getBytes()[0] = (byte) (kmer.getBytes()[0] | ((nextCode & 0x3) << ((kmer.getKmerLength() % 4) << 1)));
+ }
+ this.kmer.clearLeadBit();
+ return this.kmer;
+ }
+
+ /**
+ * Merge kmer with previous neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAAGCT
+ * The result lives in the factory's shared kmer, which later calls overwrite.
+ * NOTE(review): bytes are indexed from 0 here, so a non-zero getOffset() on
+ * either kmer is not taken into account - confirm against callers.
+ * @param kmer
+ * : input kmer
+ * @param preCode
+ * : previous neighbor in gene-code format; only its low 2 bits are used
+ * @return the merged Kmer,this K of this Kmer is k+1
+ */
+ public KmerBytesWritable mergeKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+ this.kmer.reset(kmer.getKmerLength() + 1);
+ int byteInMergedKmer = 0;
+ if (kmer.getKmerLength() % 4 == 0) {
+ this.kmer.getBytes()[0] = (byte) ((kmer.getBytes()[0] >> 6) & 0x3);
+ byteInMergedKmer++;
+ }
+ for (int i = 0; i < kmer.getLength() - 1; i++, byteInMergedKmer++) {
+ this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[i] << 2) | ((kmer.getBytes()[i + 1] >> 6) & 0x3));
+ }
+ this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[kmer.getLength() - 1] << 2) | (preCode & 0x3));
+ this.kmer.clearLeadBit();
+ return this.kmer;
+ }
+
+ /**
+ * Merge two kmer to one kmer
+ * e.g. ACTA + ACCGT => ACTAACCGT
+ *
+ * @param preK
+ * : previous k of kmer
+ * @param kmerPre
+ * : bytes array of previous kmer
+ * @param nextK
+ * : next k of kmer
+ * @param kmerNext
+ * : bytes array of next kmer
+ * @return merged kmer, the new k is @preK + @nextK
+ */
+ public KmerBytesWritable mergeTwoKmer(final KmerBytesWritable preKmer, final KmerBytesWritable nextKmer) {
+ kmer.reset(preKmer.getKmerLength() + nextKmer.getKmerLength());
+ int i = 1;
+ for (; i <= preKmer.getLength(); i++) {
+ kmer.getBytes()[kmer.getLength() - i] = preKmer.getBytes()[preKmer.getLength() - i];
+ }
+ if (i > 1) {
+ i--;
+ }
+ if (preKmer.getKmerLength() % 4 == 0) {
+ for (int j = 1; j <= nextKmer.getLength(); j++) {
+ kmer.getBytes()[kmer.getLength() - i - j] = nextKmer.getBytes()[nextKmer.getLength() - j];
+ }
+ } else {
+ int posNeedToMove = ((preKmer.getKmerLength() % 4) << 1);
+ kmer.getBytes()[kmer.getLength() - i] |= nextKmer.getBytes()[nextKmer.getLength() - 1] << posNeedToMove;
+ for (int j = 1; j < nextKmer.getLength(); j++) {
+ kmer.getBytes()[kmer.getLength() - i - j] = (byte) (((nextKmer.getBytes()[nextKmer.getLength() - j] & 0xff) >> (8 - posNeedToMove)) | (nextKmer
+ .getBytes()[nextKmer.getLength() - j - 1] << posNeedToMove));
+ }
+ if (nextKmer.getKmerLength() % 4 == 0 || (nextKmer.getKmerLength() % 4) * 2 + posNeedToMove > 8) {
+ kmer.getBytes()[0] = (byte) ((0xff & nextKmer.getBytes()[0]) >> (8 - posNeedToMove));
+ }
+ }
+ kmer.clearLeadBit();
+ return kmer;
+ }
+
+ /**
+ * Safely shifted the kmer forward without change the input kmer
+ * e.g. AGCGC shift with T => GCGCT
+ *
+ * @param k
+ * : kmer length
+ * @param kmer
+ * : input kmer
+ * @param afterCode
+ * : input genecode
+ * @return new created kmer that shifted by afterCode, the K will not change
+ */
+ public KmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode) {
+ this.kmer.set(kmer);
+ this.kmer.shiftKmerWithNextCode(afterCode);
+ return this.kmer;
+ }
+
+ /**
+ * Safely shifted the kmer backward without change the input kmer
+ * e.g. AGCGC shift with T => TAGCG
+ *
+ * @param k
+ * : kmer length
+ * @param kmer
+ * : input kmer
+ * @param preCode
+ * : input genecode
+ * @return new created kmer that shifted by preCode, the K will not change
+ */
+ public KmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+ this.kmer.set(kmer);
+ this.kmer.shiftKmerWithPreCode(preCode);
+ return this.kmer;
+ }
+
+ /**
+ * get the reverse sequence of given kmer
+ *
+ * @param kmer
+ */
+ public KmerBytesWritable reverse(final KmerBytesWritable kmer) {
+ this.kmer.reset(kmer.getKmerLength());
+
+ int curPosAtKmer = ((kmer.getKmerLength() - 1) % 4) << 1;
+ int curByteAtKmer = 0;
+
+ int curPosAtReverse = 0;
+ int curByteAtReverse = this.kmer.getLength() - 1;
+ this.kmer.getBytes()[curByteAtReverse] = 0;
+ for (int i = 0; i < kmer.getKmerLength(); i++) {
+ byte gene = (byte) ((kmer.getBytes()[curByteAtKmer] >> curPosAtKmer) & 0x03);
+ this.kmer.getBytes()[curByteAtReverse] |= gene << curPosAtReverse;
+ curPosAtReverse += 2;
+ if (curPosAtReverse >= 8) {
+ curPosAtReverse = 0;
+ this.kmer.getBytes()[--curByteAtReverse] = 0;
+ }
+ curPosAtKmer -= 2;
+ if (curPosAtKmer < 0) {
+ curPosAtKmer = 6;
+ curByteAtKmer++;
+ }
+ }
+ this.kmer.clearLeadBit();
+ return this.kmer;
+ }
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/oldtype/NodeWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/NodeWritable.java
similarity index 77%
rename from genomix/genomix-data/src/main/java/edu/uci/ics/genomix/oldtype/NodeWritable.java
rename to genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/NodeWritable.java
index 9fc1829..128bf9f 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/oldtype/NodeWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/NodeWritable.java
@@ -13,7 +13,7 @@
* limitations under the License.
*/
-package edu.uci.ics.genomix.oldtype;
+package edu.uci.ics.genomix.velvet.oldtype;
import java.io.DataInput;
import java.io.DataOutput;
@@ -22,24 +22,11 @@
import org.apache.hadoop.io.WritableComparable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
-
public class NodeWritable implements WritableComparable<NodeWritable>, Serializable {
/**
*
*/
private static final long serialVersionUID = 1L;
- public static final NodeWritable EMPTY_NODE = new NodeWritable(0);
-
- // merge/update directions
- public static class DirectionFlag {
- public static final byte DIR_FF = 0b00 << 0;
- public static final byte DIR_FR = 0b01 << 0;
- public static final byte DIR_RF = 0b10 << 0;
- public static final byte DIR_RR = 0b11 << 0;
- public static final byte DIR_MASK = 0b11 << 0;
- }
-
private PositionWritable nodeID;
private PositionListWritable forwardForwardList;
private PositionListWritable forwardReverseList;
@@ -71,16 +58,6 @@
kmer.set(kmer);
}
- public void set(PositionWritable nodeID, PositionListWritable FFList, PositionListWritable FRList,
- PositionListWritable RFList, PositionListWritable RRList, KmerBytesWritable kmer) {
- this.nodeID.set(nodeID);
- this.forwardForwardList.set(FFList);
- this.forwardReverseList.set(FRList);
- this.reverseForwardList.set(RFList);
- this.reverseReverseList.set(RRList);
- this.kmer.set(kmer);
- }
-
public void setNodeID(PositionWritable ref) {
this.setNodeID(ref.getReadID(), ref.getPosInRead());
}
@@ -118,21 +95,6 @@
return reverseReverseList;
}
- public PositionListWritable getListFromDir(byte dir) {
- switch (dir & DirectionFlag.DIR_MASK) {
- case DirectionFlag.DIR_FF:
- return getFFList();
- case DirectionFlag.DIR_FR:
- return getFRList();
- case DirectionFlag.DIR_RF:
- return getRFList();
- case DirectionFlag.DIR_RR:
- return getRRList();
- default:
- throw new RuntimeException("Unrecognized direction in getListFromDir: " + dir);
- }
- }
-
public PositionWritable getNodeID() {
return nodeID;
}
@@ -148,13 +110,13 @@
public void mergeForwardNext(NodeWritable nextNode, int initialKmerSize) {
this.forwardForwardList.set(nextNode.forwardForwardList);
this.forwardReverseList.set(nextNode.forwardReverseList);
- kmer.mergeWithFFKmer(initialKmerSize, nextNode.getKmer());
+ kmer.mergeNextKmer(initialKmerSize, nextNode.getKmer());
}
public void mergeForwardPre(NodeWritable preNode, int initialKmerSize) {
this.reverseForwardList.set(preNode.reverseForwardList);
this.reverseReverseList.set(preNode.reverseReverseList);
- kmer.mergeWithRRKmer(initialKmerSize, preNode.getKmer());
+ kmer.mergePreKmer(initialKmerSize, preNode.getKmer());
}
public void set(NodeWritable node) {
@@ -211,13 +173,13 @@
@Override
public String toString() {
StringBuilder sbuilder = new StringBuilder();
- sbuilder.append('{');
+ sbuilder.append('(');
sbuilder.append(nodeID.toString()).append('\t');
sbuilder.append(forwardForwardList.toString()).append('\t');
sbuilder.append(forwardReverseList.toString()).append('\t');
sbuilder.append(reverseForwardList.toString()).append('\t');
sbuilder.append(reverseReverseList.toString()).append('\t');
- sbuilder.append(kmer.toString()).append('}');
+ sbuilder.append(kmer.toString()).append(')');
return sbuilder.toString();
}
@@ -236,8 +198,4 @@
return inDegree() == 1 && outDegree() == 1;
}
- public boolean isSimpleOrTerminalPath() {
- return isPathNode() || (inDegree() == 0 && outDegree() == 1) || (inDegree() == 1 && outDegree() == 0);
- }
-
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/PositionListWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/PositionListWritable.java
new file mode 100644
index 0000000..b6c42c2
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/PositionListWritable.java
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.velvet.oldtype;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.io.Writable;
+
+import edu.uci.ics.genomix.data.Marshal;
+
+public class PositionListWritable implements Writable, Iterable<PositionWritable>, Serializable {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ protected byte[] storage;
+ protected int offset;
+ protected int valueCount;
+ protected static final byte[] EMPTY = {};
+ public static final int INTBYTES = 4;
+
+ protected PositionWritable posIter = new PositionWritable();
+
+ public PositionListWritable() {
+ this.storage = EMPTY;
+ this.valueCount = 0;
+ this.offset = 0;
+ }
+
+ public PositionListWritable(int count, byte[] data, int offset) {
+ setNewReference(count, data, offset);
+ }
+
+ public PositionListWritable(List<PositionWritable> posns) {
+ this();
+ for (PositionWritable p : posns) {
+ append(p);
+ }
+ }
+
+ public void setNewReference(int count, byte[] data, int offset) {
+ this.valueCount = count;
+ this.storage = data;
+ this.offset = offset;
+ }
+
+ protected void setSize(int size) {
+ if (size > getCapacity()) {
+ setCapacity((size * 3 / 2));
+ }
+ }
+
+ protected int getCapacity() {
+ return storage.length - offset;
+ }
+
+ protected void setCapacity(int new_cap) {
+ if (new_cap > getCapacity()) {
+ byte[] new_data = new byte[new_cap];
+ if (storage.length - offset > 0) {
+ System.arraycopy(storage, offset, new_data, 0, storage.length - offset);
+ }
+ storage = new_data;
+ offset = 0;
+ }
+ }
+
+ public PositionWritable getPosition(int i) {
+ if (i >= valueCount) {
+ throw new ArrayIndexOutOfBoundsException("No such positions");
+ }
+ posIter.setNewReference(storage, offset + i * PositionWritable.LENGTH);
+ return posIter;
+ }
+
+ public void resetPosition(int i, int readID, byte posInRead) { // overwrites entry i in place
+ if (i < 0 || i >= valueCount) { // only already-populated entries may be rewritten
+ throw new ArrayIndexOutOfBoundsException("No such positions");
+ }
+ Marshal.putInt(readID, storage, offset + i * PositionWritable.LENGTH); // read ID at the start of entry i
+ storage[offset + i * PositionWritable.LENGTH + INTBYTES] = posInRead; // position byte of entry i, not of entry 0
+ }
+
+ @Override
+ public Iterator<PositionWritable> iterator() {
+ Iterator<PositionWritable> it = new Iterator<PositionWritable>() {
+
+ private int currentIndex = 0;
+
+ @Override
+ public boolean hasNext() {
+ return currentIndex < valueCount;
+ }
+
+ @Override
+ public PositionWritable next() {
+ return getPosition(currentIndex++);
+ }
+
+ @Override
+ public void remove() {
+ }
+ };
+ return it;
+ }
+
+ public void set(PositionListWritable list2) {
+ set(list2.valueCount, list2.storage, list2.offset);
+ }
+
+ public void set(int valueCount, byte[] newData, int offset) {
+ this.valueCount = valueCount;
+ setSize(valueCount * PositionWritable.LENGTH);
+ if (valueCount > 0) {
+ System.arraycopy(newData, offset, storage, this.offset, valueCount * PositionWritable.LENGTH);
+ }
+ }
+
+ public void reset() {
+ valueCount = 0;
+ }
+
+ public void append(PositionWritable pos) {
+ setSize((1 + valueCount) * PositionWritable.LENGTH);
+ System.arraycopy(pos.getByteArray(), pos.getStartOffset(), storage, offset + valueCount
+ * PositionWritable.LENGTH, pos.getLength());
+ valueCount += 1;
+ }
+
+ public void append(int readID, byte posInRead) {
+ setSize((1 + valueCount) * PositionWritable.LENGTH);
+ Marshal.putInt(readID, storage, offset + valueCount * PositionWritable.LENGTH);
+ storage[offset + valueCount * PositionWritable.LENGTH + PositionWritable.INTBYTES] = posInRead;
+ valueCount += 1;
+ }
+
+ /** Number of positions held in length bytes; length must be a multiple of PositionWritable.LENGTH. */
+ public static int getCountByDataLength(int length) {
+ if (length % PositionWritable.LENGTH != 0) {
+ // No stack dump to System.out here: the exception below
+ // already carries the trace to whoever handles it.
+ throw new IllegalArgumentException("Length of positionlist is invalid");
+ }
+ return length / PositionWritable.LENGTH;
+ }
+
+ public int getCountOfPosition() {
+ return valueCount;
+ }
+
+ public byte[] getByteArray() {
+ return storage;
+ }
+
+ public int getStartOffset() {
+ return offset;
+ }
+
+ public int getLength() {
+ return valueCount * PositionWritable.LENGTH;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ this.valueCount = in.readInt();
+ setSize(valueCount * PositionWritable.LENGTH);
+ in.readFully(storage, offset, valueCount * PositionWritable.LENGTH);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(valueCount);
+ out.write(storage, offset, valueCount * PositionWritable.LENGTH);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sbuilder = new StringBuilder();
+ sbuilder.append('[');
+ for (PositionWritable pos : this) {
+ sbuilder.append(pos.toString());
+ sbuilder.append(',');
+ }
+ if (valueCount > 0) {
+ sbuilder.setCharAt(sbuilder.length() - 1, ']');
+ } else {
+ sbuilder.append(']');
+ }
+ return sbuilder.toString();
+ }
+
+ @Override
+ public int hashCode() {
+ return Marshal.hashBytes(getByteArray(), getStartOffset(), getLength());
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof PositionListWritable))
+ return false;
+ PositionListWritable other = (PositionListWritable) o;
+ if (this.valueCount != other.valueCount)
+ return false;
+ for (int i=0; i < this.valueCount; i++) {
+ if (!this.getPosition(i).equals(other.getPosition(i)))
+ return false;
+ }
+ return true;
+ }
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/PositionWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/PositionWritable.java
new file mode 100644
index 0000000..1d509bb
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/velvet/oldtype/PositionWritable.java
@@ -0,0 +1,170 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.velvet.oldtype;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+
+import edu.uci.ics.genomix.data.Marshal;
+
+public class PositionWritable implements WritableComparable<PositionWritable>, Serializable {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ protected byte[] storage;
+ protected int offset;
+ public static final int LENGTH = 5;
+ public static final int INTBYTES = 4;
+
+ public PositionWritable() {
+ storage = new byte[LENGTH];
+ offset = 0;
+ }
+
+ public PositionWritable(int readID, byte posInRead) {
+ this();
+ set(readID, posInRead);
+ }
+
+ public PositionWritable(byte[] storage, int offset) {
+ setNewReference(storage, offset);
+ }
+
+ public void setNewReference(byte[] storage, int offset) {
+ this.storage = storage;
+ this.offset = offset;
+ }
+
+ public void set(PositionWritable pos) {
+ set(pos.getReadID(), pos.getPosInRead());
+ }
+
+ public void set(int readID, byte posInRead) {
+ Marshal.putInt(readID, storage, offset);
+ storage[offset + INTBYTES] = posInRead;
+ }
+
+ public int getReadID() {
+ return Marshal.getInt(storage, offset);
+ }
+
+ public byte getPosInRead() {
+ return storage[offset + INTBYTES];
+ }
+
+ public byte[] getByteArray() {
+ return storage;
+ }
+
+ public int getStartOffset() {
+ return offset;
+ }
+
+ public int getLength() {
+ return LENGTH;
+ }
+
+ public boolean isSameReadID(PositionWritable other) {
+ return getReadID() == other.getReadID();
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ in.readFully(storage, offset, LENGTH);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.write(storage, offset, LENGTH);
+ }
+
+ @Override
+ public int hashCode() {
+ return Marshal.hashBytes(getByteArray(), getStartOffset(), getLength());
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof PositionWritable))
+ return false;
+ PositionWritable other = (PositionWritable) o;
+ return this.getReadID() == other.getReadID() && this.getPosInRead() == other.getPosInRead();
+ }
+
+ @Override
+ public int compareTo(PositionWritable other) {
+ // Integer.compare instead of subtraction: the difference of two ints can overflow and flip sign.
+ int byReadId = Integer.compare(this.getReadID(), other.getReadID());
+ if (byReadId != 0) {
+ return byReadId;
+ }
+ // Same read: smaller |posInRead| first (byte range, so no overflow here);
+ // for equal magnitudes the signed values decide.
+ int byMagnitude = Math.abs((int) this.getPosInRead()) - Math.abs((int) other.getPosInRead());
+ return byMagnitude != 0 ? byMagnitude : this.getPosInRead() - other.getPosInRead();
+ }
+
+ @Override
+ public String toString() {
+ return "(" + Integer.toString(getReadID()) + "," + Integer.toString((int) getPosInRead()) + ")";
+ }
+
+ /** Raw-bytes comparator for serialized PositionWritable values: read ID, then |posInRead|, then posInRead. */
+ public static class Comparator extends WritableComparator {
+ public Comparator() {
+ super(PositionWritable.class);
+ }
+
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ // Integer.compare instead of subtraction: the difference of
+ // two read IDs can overflow and flip the sign of the result.
+ int byReadId = Integer.compare(Marshal.getInt(b1, s1), Marshal.getInt(b2, s2));
+ if (byReadId != 0) {
+ return byReadId;
+ }
+ // Same read: the position byte follows the 4-byte read ID.
+ byte pos1 = b1[s1 + INTBYTES];
+ byte pos2 = b2[s2 + INTBYTES];
+ int byMagnitude = Math.abs((int) pos1) - Math.abs((int) pos2);
+ return byMagnitude != 0 ? byMagnitude : pos1 - pos2;
+ }
+ }
+
+ public static class FirstComparator implements RawComparator<PositionWritable> {
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ return WritableComparator.compareBytes(b1, s1, l1 - 1, b2, s2, l2 - 1);
+ }
+
+ @Override
+ public int compare(PositionWritable o1, PositionWritable o2) {
+ int l = o1.getReadID();
+ int r = o2.getReadID();
+ return l == r ? 0 : (l < r ? -1 : 1);
+ }
+ }
+
+ static { // register this comparator
+ WritableComparator.define(PositionWritable.class, new Comparator());
+ }
+}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableFactoryTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableFactoryTest.java
index 486e13b..7808719 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableFactoryTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableFactoryTest.java
@@ -19,8 +19,8 @@
import org.junit.Test;
import edu.uci.ics.genomix.type.GeneCode;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerBytesWritableFactory;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
public class KmerBytesWritableFactoryTest {
static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
@@ -29,17 +29,17 @@
@Test
public void TestGetLastKmer() {
- KmerBytesWritable kmer = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer = new VKmerBytesWritable(9);
kmer.setByRead(array, 0);
Assert.assertEquals("AGCTGACCG", kmer.toString());
- KmerBytesWritable lastKmer;
+ VKmerBytesWritable lastKmer;
for (int i = 8; i > 0; i--) {
lastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
lastKmer = kmerFactory.getSubKmerFromChain(9 - i, i, kmer);
Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
}
- KmerBytesWritable vlastKmer;
+ VKmerBytesWritable vlastKmer;
for (int i = 8; i > 0; i--) {
vlastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
Assert.assertEquals("AGCTGACCG".substring(9 - i), vlastKmer.toString());
@@ -50,17 +50,17 @@
@Test
public void TestGetFirstKmer() {
- KmerBytesWritable kmer = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer = new VKmerBytesWritable(9);
kmer.setByRead(array, 0);
Assert.assertEquals("AGCTGACCG", kmer.toString());
- KmerBytesWritable firstKmer;
+ VKmerBytesWritable firstKmer;
for (int i = 8; i > 0; i--) {
firstKmer = kmerFactory.getFirstKmerFromChain(i, kmer);
Assert.assertEquals("AGCTGACCG".substring(0, i), firstKmer.toString());
firstKmer = kmerFactory.getSubKmerFromChain(0, i, kmer);
Assert.assertEquals("AGCTGACCG".substring(0, i), firstKmer.toString());
}
- KmerBytesWritable vfirstKmer;
+ VKmerBytesWritable vfirstKmer;
for (int i = 8; i > 0; i--) {
vfirstKmer = kmerFactory.getFirstKmerFromChain(i, kmer);
Assert.assertEquals("AGCTGACCG".substring(0, i), vfirstKmer.toString());
@@ -71,12 +71,12 @@
@Test
public void TestGetSubKmer() {
- KmerBytesWritable kmer = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer = new VKmerBytesWritable(9);
kmer.setByRead(array, 0);
Assert.assertEquals("AGCTGACCG", kmer.toString());
- KmerBytesWritable subKmer;
- for (int istart = 0; istart < kmer.getKmerLength() - 1; istart++) {
- for (int isize = 1; isize + istart <= kmer.getKmerLength(); isize++) {
+ VKmerBytesWritable subKmer;
+ for (int istart = 0; istart < kmer.getKmerLetterLength() - 1; istart++) {
+ for (int isize = 1; isize + istart <= kmer.getKmerLetterLength(); isize++) {
subKmer = kmerFactory.getSubKmerFromChain(istart, isize, kmer);
Assert.assertEquals("AGCTGACCG".substring(istart, istart + isize), subKmer.toString());
}
@@ -85,60 +85,60 @@
@Test
public void TestMergeNext() {
- KmerBytesWritable kmer = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer = new VKmerBytesWritable(9);
kmer.setByRead(array, 0);
Assert.assertEquals("AGCTGACCG", kmer.toString());
String text = "AGCTGACCG";
for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
- KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
+ VKmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
text = text + (char) GeneCode.GENE_SYMBOL[x];
Assert.assertEquals(text, newkmer.toString());
- kmer = new KmerBytesWritable(newkmer);
+ kmer = new VKmerBytesWritable(newkmer);
}
for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
- KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
+ VKmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
text = text + (char) GeneCode.GENE_SYMBOL[x];
Assert.assertEquals(text, newkmer.toString());
- kmer = new KmerBytesWritable(newkmer);
+ kmer = new VKmerBytesWritable(newkmer);
}
}
@Test
public void TestMergePre() {
- KmerBytesWritable kmer = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer = new VKmerBytesWritable(9);
kmer.setByRead(array, 0);
Assert.assertEquals("AGCTGACCG", kmer.toString());
String text = "AGCTGACCG";
for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
- KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
+ VKmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
text = (char) GeneCode.GENE_SYMBOL[x] + text;
Assert.assertEquals(text, newkmer.toString());
- kmer = new KmerBytesWritable(newkmer);
+ kmer = new VKmerBytesWritable(newkmer);
}
for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
- KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
+ VKmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
text = (char) GeneCode.GENE_SYMBOL[x] + text;
Assert.assertEquals(text, newkmer.toString());
- kmer = new KmerBytesWritable(newkmer);
+ kmer = new VKmerBytesWritable(newkmer);
}
}
@Test
public void TestMergeTwoKmer() {
- KmerBytesWritable kmer1 = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer1 = new VKmerBytesWritable(9);
kmer1.setByRead(array, 0);
String text1 = "AGCTGACCG";
- KmerBytesWritable kmer2 = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer2 = new VKmerBytesWritable(9);
kmer2.setByRead(array, 1);
String text2 = "GCTGACCGT";
Assert.assertEquals(text1, kmer1.toString());
Assert.assertEquals(text2, kmer2.toString());
- KmerBytesWritable merged = kmerFactory.mergeTwoKmer(kmer1, kmer2);
+ VKmerBytesWritable merged = kmerFactory.mergeTwoKmer(kmer1, kmer2);
Assert.assertEquals(text1 + text2, merged.toString());
- KmerBytesWritable kmer3 = new KmerBytesWritable(3);
+ VKmerBytesWritable kmer3 = new VKmerBytesWritable(3);
kmer3.setByRead(array, 1);
String text3 = "GCT";
Assert.assertEquals(text3, kmer3.toString());
@@ -148,17 +148,17 @@
merged = kmerFactory.mergeTwoKmer(kmer3, kmer1);
Assert.assertEquals(text3 + text1, merged.toString());
- KmerBytesWritable kmer4 = new KmerBytesWritable(8);
+ VKmerBytesWritable kmer4 = new VKmerBytesWritable(8);
kmer4.setByRead(array, 0);
String text4 = "AGCTGACC";
Assert.assertEquals(text4, kmer4.toString());
merged = kmerFactory.mergeTwoKmer(kmer4, kmer3);
Assert.assertEquals(text4 + text3, merged.toString());
- KmerBytesWritable kmer5 = new KmerBytesWritable(7);
+ VKmerBytesWritable kmer5 = new VKmerBytesWritable(7);
kmer5.setByRead(array, 0);
String text5 = "AGCTGAC";
- KmerBytesWritable kmer6 = new KmerBytesWritable(9);
+ VKmerBytesWritable kmer6 = new VKmerBytesWritable(9);
kmer6.setByRead(9, array, 1);
String text6 = "GCTGACCGT";
merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
@@ -173,19 +173,18 @@
String text8 = "GCTG";
merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
Assert.assertEquals(text5 + text8, merged.toString());
-
}
@Test
public void TestShift() {
- KmerBytesWritable kmer = new KmerBytesWritable(kmerFactory.getKmerByRead(9, array, 0));
+ VKmerBytesWritable kmer = new VKmerBytesWritable(kmerFactory.getKmerByRead(9, array, 0));
String text = "AGCTGACCG";
Assert.assertEquals(text, kmer.toString());
- KmerBytesWritable kmerForward = kmerFactory.shiftKmerWithNextCode(kmer, GeneCode.A);
+ VKmerBytesWritable kmerForward = kmerFactory.shiftKmerWithNextCode(kmer, GeneCode.A);
Assert.assertEquals(text, kmer.toString());
Assert.assertEquals("GCTGACCGA", kmerForward.toString());
- KmerBytesWritable kmerBackward = kmerFactory.shiftKmerWithPreCode(kmer, GeneCode.C);
+ VKmerBytesWritable kmerBackward = kmerFactory.shiftKmerWithPreCode(kmer, GeneCode.C);
Assert.assertEquals(text, kmer.toString());
Assert.assertEquals("CAGCTGACC", kmerBackward.toString());
@@ -193,10 +192,10 @@
@Test
public void TestReverseKmer() {
- KmerBytesWritable kmer = new KmerBytesWritable(7);
+ VKmerBytesWritable kmer = new VKmerBytesWritable(7);
kmer.setByRead(array, 0);
Assert.assertEquals(kmer.toString(), "AGCTGAC");
- KmerBytesWritable reversed = kmerFactory.reverse(kmer);
+ VKmerBytesWritable reversed = kmerFactory.reverse(kmer);
Assert.assertEquals(reversed.toString(), "CAGTCGA");
kmer.reset(8);
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
index fbd458e..81565a3 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
@@ -33,7 +33,8 @@
@Test
public void TestCompressKmer() {
- KmerBytesWritable kmer = new KmerBytesWritable(k);
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
kmer.setByRead(array, 0);
Assert.assertEquals(kmer.toString(), "AATAGAA");
@@ -43,7 +44,8 @@
@Test
public void TestMoveKmer() {
- KmerBytesWritable kmer = new KmerBytesWritable(k);
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
kmer.setByRead(array, 0);
Assert.assertEquals(kmer.toString(), "AATAGAA");
@@ -59,7 +61,8 @@
@Test
public void TestCompressKmerReverse() {
- KmerBytesWritable kmer = new KmerBytesWritable(k);
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
kmer.setByRead(array, 0);
Assert.assertEquals(kmer.toString(), "AATAGAA");
@@ -69,7 +72,8 @@
@Test
public void TestMoveKmerReverse() {
- KmerBytesWritable kmer = new KmerBytesWritable(k);
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
kmer.setByRead(array, 0);
Assert.assertEquals(kmer.toString(), "AATAGAA");
@@ -85,7 +89,8 @@
@Test
public void TestGetGene() {
- KmerBytesWritable kmer = new KmerBytesWritable(9);
+ KmerBytesWritable.setGlobalKmerLength(9);
+ KmerBytesWritable kmer = new KmerBytesWritable();
String text = "AGCTGACCG";
byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G' };
kmer.setByRead(array, 0);
@@ -100,8 +105,9 @@
byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
String string = "AGCTGACCGT";
for (int k = 3; k <= 10; k++) {
- KmerBytesWritable kmer = new KmerBytesWritable(k);
- KmerBytesWritable kmerAppend = new KmerBytesWritable(k);
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
+ KmerBytesWritable kmerAppend = new KmerBytesWritable();
kmer.setByRead(array, 0);
Assert.assertEquals(string.substring(0, k), kmer.toString());
for (int b = 0; b < k; b++) {
@@ -118,301 +124,4 @@
Assert.assertEquals(kmer.toString(), kmerAppend.toString());
}
}
-
- @Test
- public void TestMergeFFKmer() {
- byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
- String text = "AGCTGACCGT";
- KmerBytesWritable kmer1 = new KmerBytesWritable(8);
- kmer1.setByRead(array, 0);
- String text1 = "AGCTGACC";
- KmerBytesWritable kmer2 = new KmerBytesWritable(8);
- kmer2.setByRead(array, 1);
- String text2 = "GCTGACCG";
- Assert.assertEquals(text2, kmer2.toString());
- KmerBytesWritable merge = new KmerBytesWritable(kmer1);
- int kmerSize = 8;
- merge.mergeWithFFKmer(kmerSize, kmer2);
- Assert.assertEquals(text1 + text2.substring(kmerSize - 1), merge.toString());
-
- for (int i = 1; i < 8; i++) {
- merge.set(kmer1);
- merge.mergeWithFFKmer(i, kmer2);
- Assert.assertEquals(text1 + text2.substring(i - 1), merge.toString());
- }
-
- for (int ik = 1; ik <= 10; ik++) {
- for (int jk = 1; jk <= 10; jk++) {
- kmer1 = new KmerBytesWritable(ik);
- kmer2 = new KmerBytesWritable(jk);
- kmer1.setByRead(array, 0);
- kmer2.setByRead(array, 0);
- text1 = text.substring(0, ik);
- text2 = text.substring(0, jk);
- Assert.assertEquals(text1, kmer1.toString());
- Assert.assertEquals(text2, kmer2.toString());
- for (int x = 1; x < jk; x++) {
- merge.set(kmer1);
- merge.mergeWithFFKmer(x, kmer2);
- Assert.assertEquals(text1 + text2.substring(x - 1), merge.toString());
- }
- }
- }
- }
-
- @Test
- public void TestMergeFRKmer() {
- int kmerSize = 3;
- String result = "AAGCTAACAACC";
- byte[] resultArray = result.getBytes();
-
- String text1 = "AAGCTAA";
- KmerBytesWritable kmer1 = new KmerBytesWritable(text1.length());
- kmer1.setByRead(resultArray, 0);
- Assert.assertEquals(text1, kmer1.toString());
-
- // kmer2 is the rc of the end of the read
- String text2 = "GGTTGTT";
- KmerBytesWritable kmer2 = new KmerBytesWritable(text2.length());
- kmer2.setByReadReverse(resultArray, result.length() - text2.length());
- Assert.assertEquals(text2, kmer2.toString());
-
- KmerBytesWritable merge = new KmerBytesWritable(kmer1);
- merge.mergeWithFRKmer(kmerSize, kmer2);
- Assert.assertEquals(result, merge.toString());
-
- int i = 1;
- merge.set(kmer1);
- merge.mergeWithFRKmer(i, kmer2);
- Assert.assertEquals("AAGCTAAAACAACC", merge.toString());
-
- i = 2;
- merge.set(kmer1);
- merge.mergeWithFRKmer(i, kmer2);
- Assert.assertEquals("AAGCTAAACAACC", merge.toString());
-
- i = 3;
- merge.set(kmer1);
- merge.mergeWithFRKmer(i, kmer2);
- Assert.assertEquals("AAGCTAACAACC", merge.toString());
- }
-
-
- @Test
- public void TestMergeRFKmer() {
- int kmerSize = 3;
- String result = "GGCACAACAACCC";
- byte[] resultArray = result.getBytes();
-
- String text1 = "AACAACCC";
- KmerBytesWritable kmer1 = new KmerBytesWritable(text1.length());
- kmer1.setByRead(resultArray, 5);
- Assert.assertEquals(text1, kmer1.toString());
-
- // kmer2 is the rc of the end of the read
- String text2 = "TTGTGCC";
- KmerBytesWritable kmer2 = new KmerBytesWritable(text2.length());
- kmer2.setByReadReverse(resultArray, 0);
- Assert.assertEquals(text2, kmer2.toString());
-
- KmerBytesWritable merge = new KmerBytesWritable(kmer1);
- merge.mergeWithRFKmer(kmerSize, kmer2);
- Assert.assertEquals(result, merge.toString());
-
- int i = 1;
- merge.set(kmer1);
- merge.mergeWithRFKmer(i, kmer2);
- Assert.assertEquals("GGCACAAAACAACCC", merge.toString());
-
- i = 2;
- merge.set(kmer1);
- merge.mergeWithRFKmer(i, kmer2);
- Assert.assertEquals("GGCACAAACAACCC", merge.toString());
-
- i = 3;
- merge.set(kmer1);
- merge.mergeWithRFKmer(i, kmer2);
- Assert.assertEquals("GGCACAACAACCC", merge.toString());
-
- String test1;
- String test2;
- test1 = "CTA";
- test2 = "AGA";
- KmerBytesWritable k1 = new KmerBytesWritable(3);
- KmerBytesWritable k2 = new KmerBytesWritable(3);
- k1.setByRead(test1.getBytes(), 0);
- k2.setByRead(test2.getBytes(), 0);
- k1.mergeWithRFKmer(3, k2);
- Assert.assertEquals("TCTA", k1.toString());
-
- test1 = "CTA";
- test2 = "ATA"; //TAT
- k1 = new KmerBytesWritable(3);
- k2 = new KmerBytesWritable(3);
- k1.setByRead(test1.getBytes(), 0);
- k2.setByRead(test2.getBytes(), 0);
- k1.mergeWithFRKmer(3, k2);
- Assert.assertEquals("CTAT", k1.toString());
-
- test1 = "ATA";
- test2 = "CTA"; //TAT
- k1 = new KmerBytesWritable(3);
- k2 = new KmerBytesWritable(3);
- k1.setByRead(test1.getBytes(), 0);
- k2.setByRead(test2.getBytes(), 0);
- k1.mergeWithFRKmer(3, k2);
- Assert.assertEquals("ATAG", k1.toString());
-
- test1 = "TCTAT";
- test2 = "GAAC";
- k1 = new KmerBytesWritable(5);
- k2 = new KmerBytesWritable(4);
- k1.setByRead(test1.getBytes(), 0);
- k2.setByRead(test2.getBytes(), 0);
- k1.mergeWithRFKmer(3, k2);
- Assert.assertEquals("GTTCTAT", k1.toString());
- }
-
-
-
- @Test
- public void TestMergeRRKmer() {
- byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
- String text = "AGCTGACCGT";
- KmerBytesWritable kmer1 = new KmerBytesWritable(8);
- kmer1.setByRead(array, 0);
- String text1 = "AGCTGACC";
- KmerBytesWritable kmer2 = new KmerBytesWritable(8);
- kmer2.setByRead(array, 1);
- String text2 = "GCTGACCG";
- Assert.assertEquals(text2, kmer2.toString());
- KmerBytesWritable merge = new KmerBytesWritable(kmer2);
- int kmerSize = 8;
- merge.mergeWithRRKmer(kmerSize, kmer1);
- Assert.assertEquals(text1 + text2.substring(kmerSize - 1), merge.toString());
-
- for (int i = 1; i < 8; i++) {
- merge.set(kmer2);
- merge.mergeWithRRKmer(i, kmer1);
- Assert.assertEquals(text1.substring(0, text1.length() - i + 1) + text2, merge.toString());
- }
-
- for (int ik = 1; ik <= 10; ik++) {
- for (int jk = 1; jk <= 10; jk++) {
- kmer1 = new KmerBytesWritable(ik);
- kmer2 = new KmerBytesWritable(jk);
- kmer1.setByRead(array, 0);
- kmer2.setByRead(array, 0);
- text1 = text.substring(0, ik);
- text2 = text.substring(0, jk);
- Assert.assertEquals(text1, kmer1.toString());
- Assert.assertEquals(text2, kmer2.toString());
- for (int x = 1; x < ik; x++) {
- merge.set(kmer2);
- merge.mergeWithRRKmer(x, kmer1);
- Assert.assertEquals(text1.substring(0, text1.length() - x + 1) + text2, merge.toString());
- }
- }
- }
- }
-
- @Test
- public void TestFinalMerge() {
- String selfString;
- String match;
- String msgString;
- int index;
- KmerBytesWritable kmer = new KmerBytesWritable();
- int kmerSize = 3;
-
- String F1 = "AATAG";
- String F2 = "TAGAA";
- String R1 = "CTATT";
- String R2 = "TTCTA";
-
- //FF test
- selfString = F1;
- match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
- msgString = F2;
- index = msgString.indexOf(match);
- kmer.reset(msgString.length() - index);
- kmer.setByRead(msgString.substring(index).getBytes(), 0);
- System.out.println(kmer.toString());
-
- //FR test
- selfString = F1;
- match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
- msgString = GeneCode.reverseComplement(R2);
- index = msgString.indexOf(match);
- kmer.reset(msgString.length() - index);
- kmer.setByRead(msgString.substring(index).getBytes(), 0);
- System.out.println(kmer.toString());
-
- //RF test
- selfString = R1;
- match = selfString.substring(0,kmerSize - 1);
- msgString = GeneCode.reverseComplement(F2);
- index = msgString.lastIndexOf(match) + kmerSize - 2;
- kmer.reset(index + 1);
- kmer.setByReadReverse(msgString.substring(0, index + 1).getBytes(), 0);
- System.out.println(kmer.toString());
-
- //RR test
- selfString = R1;
- match = selfString.substring(0,kmerSize - 1);
- msgString = R2;
- index = msgString.lastIndexOf(match) + kmerSize - 2;
- kmer.reset(index + 1);
- kmer.setByRead(msgString.substring(0, index + 1).getBytes(), 0);
- System.out.println(kmer.toString());
-
- String[][] connectedTable = new String[][]{
- {"FF", "RF"},
- {"FF", "RR"},
- {"FR", "RF"},
- {"FR", "RR"}
- };
- System.out.println(connectedTable[0][1]);
-
- Set<Long> s1 = new HashSet<Long>();
- Set<Long> s2 = new HashSet<Long>();
- s1.add((long) 1);
- s1.add((long) 2);
- s2.add((long) 2);
- s2.add((long) 3);
- Set<Long> intersection = new HashSet<Long>();
- intersection.addAll(s1);
- intersection.retainAll(s2);
- System.out.println(intersection.toString());
- Set<Long> difference = new HashSet<Long>();
- difference.addAll(s1);
- difference.removeAll(s2);
- System.out.println(difference.toString());
-
- Map<KmerBytesWritable, Set<Long>> map = new HashMap<KmerBytesWritable, Set<Long>>();
- KmerBytesWritable k1 = new KmerBytesWritable(3);
- Set<Long> set1 = new HashSet<Long>();
- k1.setByRead(("CTA").getBytes(), 0);
- set1.add((long)1);
- map.put(k1, set1);
- KmerBytesWritable k2 = new KmerBytesWritable(3);
- k2.setByRead(("GTA").getBytes(), 0);
- Set<Long> set2 = new HashSet<Long>();
- set2.add((long) 2);
- map.put(k2, set2);
- KmerBytesWritable k3 = new KmerBytesWritable(3);
- k3.setByRead(("ATG").getBytes(), 0);
- Set<Long> set3 = new HashSet<Long>();
- set3.add((long) 3);
- map.put(k3, set3);
- KmerBytesWritable k4 = new KmerBytesWritable(3);
- k4.setByRead(("AAT").getBytes(), 0);
- Set<Long> set4 = new HashSet<Long>();
- set4.add((long) 4);
- map.put(k4, set4);
- System.out.println("CTA = " + map.get(k1).toString());
- System.out.println("GTA = " + map.get(k2).toString());
- System.out.println("ATG = " + map.get(k3).toString());
- System.out.println("AAT = " + map.get(k4).toString());
- }
}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java.orig b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java.orig
new file mode 100644
index 0000000..8a0cb6d
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java.orig
@@ -0,0 +1,427 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.data.test;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
+public class KmerBytesWritableTest {
+ static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
+ static int k = 7;
+
+ @Test
+ public void TestCompressKmer() {
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ kmer.setByRead(array, 1);
+ Assert.assertEquals(kmer.toString(), "ATAGAAG");
+ }
+
+ @Test
+ public void TestMoveKmer() {
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ for (int i = k; i < array.length - 1; i++) {
+ kmer.shiftKmerWithNextCode(array[i]);
+ Assert.assertTrue(false);
+ }
+
+ byte out = kmer.shiftKmerWithNextChar(array[array.length - 1]);
+ Assert.assertEquals(out, GeneCode.getCodeFromSymbol((byte) 'A'));
+ Assert.assertEquals(kmer.toString(), "ATAGAAG");
+ }
+
+ @Test
+ public void TestCompressKmerReverse() {
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ kmer.setByReadReverse(array, 1);
+ Assert.assertEquals(kmer.toString(), "CTTCTAT");
+ }
+
+ @Test
+ public void TestMoveKmerReverse() {
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ for (int i = k; i < array.length - 1; i++) {
+ kmer.shiftKmerWithPreChar(array[i]);
+ Assert.assertTrue(false);
+ }
+
+ byte out = kmer.shiftKmerWithPreChar(array[array.length - 1]);
+ Assert.assertEquals(out, GeneCode.getCodeFromSymbol((byte) 'A'));
+ Assert.assertEquals(kmer.toString(), "GAATAGA");
+ }
+
+ @Test
+ public void TestGetGene() {
+ KmerBytesWritable.setGlobalKmerLength(9);
+ KmerBytesWritable kmer = new KmerBytesWritable();
+ String text = "AGCTGACCG";
+ byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G' };
+ kmer.setByRead(array, 0);
+
+ for (int i = 0; i < 9; i++) {
+ Assert.assertEquals(text.charAt(i), (char) (GeneCode.getSymbolFromCode(kmer.getGeneCodeAtPosition(i))));
+ }
+ }
+
+ @Test
+ public void TestGetOneByteFromKmer() {
+ byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+ String string = "AGCTGACCGT";
+ for (int k = 3; k <= 10; k++) {
+ KmerBytesWritable.setGlobalKmerLength(k);
+ KmerBytesWritable kmer = new KmerBytesWritable();
+ KmerBytesWritable kmerAppend = new KmerBytesWritable();
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(string.substring(0, k), kmer.toString());
+ for (int b = 0; b < k; b++) {
+ byte byteActual = KmerBytesWritable.getOneByteFromKmerAtPosition(b, kmer.getBytes(), kmer.getOffset(),
+ kmer.getLength());
+ byte byteExpect = GeneCode.getCodeFromSymbol(array[b]);
+ for (int i = 1; i < 4 && b + i < k; i++) {
+ byteExpect += GeneCode.getCodeFromSymbol(array[b + i]) << (i * 2);
+ }
+ Assert.assertEquals(byteActual, byteExpect);
+ KmerBytesWritable.appendOneByteAtPosition(b, byteActual, kmerAppend.getBytes(), kmerAppend.getOffset(),
+ kmerAppend.getLength());
+ }
+ Assert.assertEquals(kmer.toString(), kmerAppend.toString());
+ }
+ }
+<<<<<<< HEAD
+=======
+
+ @Test
+ public void TestMergeFFKmer() {
+ byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+ String text = "AGCTGACCGT";
+ KmerBytesWritable kmer1 = new KmerBytesWritable(8);
+ kmer1.setByRead(array, 0);
+ String text1 = "AGCTGACC";
+ KmerBytesWritable kmer2 = new KmerBytesWritable(8);
+ kmer2.setByRead(array, 1);
+ String text2 = "GCTGACCG";
+ Assert.assertEquals(text2, kmer2.toString());
+ KmerBytesWritable merge = new KmerBytesWritable(kmer1);
+ int kmerSize = 8;
+ merge.mergeWithFFKmer(kmerSize, kmer2);
+ Assert.assertEquals(text1 + text2.substring(kmerSize - 1), merge.toString());
+
+ for (int i = 1; i < 8; i++) {
+ merge.set(kmer1);
+ merge.mergeWithFFKmer(i, kmer2);
+ Assert.assertEquals(text1 + text2.substring(i - 1), merge.toString());
+ }
+
+ for (int ik = 1; ik <= 10; ik++) {
+ for (int jk = 1; jk <= 10; jk++) {
+ kmer1 = new KmerBytesWritable(ik);
+ kmer2 = new KmerBytesWritable(jk);
+ kmer1.setByRead(array, 0);
+ kmer2.setByRead(array, 0);
+ text1 = text.substring(0, ik);
+ text2 = text.substring(0, jk);
+ Assert.assertEquals(text1, kmer1.toString());
+ Assert.assertEquals(text2, kmer2.toString());
+ for (int x = 1; x < jk; x++) {
+ merge.set(kmer1);
+ merge.mergeWithFFKmer(x, kmer2);
+ Assert.assertEquals(text1 + text2.substring(x - 1), merge.toString());
+ }
+ }
+ }
+ }
+
+ @Test
+ public void TestMergeFRKmer() {
+ int kmerSize = 3;
+ String result = "AAGCTAACAACC";
+ byte[] resultArray = result.getBytes();
+
+ String text1 = "AAGCTAA";
+ KmerBytesWritable kmer1 = new KmerBytesWritable(text1.length());
+ kmer1.setByRead(resultArray, 0);
+ Assert.assertEquals(text1, kmer1.toString());
+
+ // kmer2 is the rc of the end of the read
+ String text2 = "GGTTGTT";
+ KmerBytesWritable kmer2 = new KmerBytesWritable(text2.length());
+ kmer2.setByReadReverse(resultArray, result.length() - text2.length());
+ Assert.assertEquals(text2, kmer2.toString());
+
+ KmerBytesWritable merge = new KmerBytesWritable(kmer1);
+ merge.mergeWithFRKmer(kmerSize, kmer2);
+ Assert.assertEquals(result, merge.toString());
+
+ int i = 1;
+ merge.set(kmer1);
+ merge.mergeWithFRKmer(i, kmer2);
+ Assert.assertEquals("AAGCTAAAACAACC", merge.toString());
+
+ i = 2;
+ merge.set(kmer1);
+ merge.mergeWithFRKmer(i, kmer2);
+ Assert.assertEquals("AAGCTAAACAACC", merge.toString());
+
+ i = 3;
+ merge.set(kmer1);
+ merge.mergeWithFRKmer(i, kmer2);
+ Assert.assertEquals("AAGCTAACAACC", merge.toString());
+ }
+
+
+ @Test
+ public void TestMergeRFKmer() {
+ int kmerSize = 3;
+ String result = "GGCACAACAACCC";
+ byte[] resultArray = result.getBytes();
+
+ String text1 = "AACAACCC";
+ KmerBytesWritable kmer1 = new KmerBytesWritable(text1.length());
+ kmer1.setByRead(resultArray, 5);
+ Assert.assertEquals(text1, kmer1.toString());
+
+ // kmer2 is the rc of the end of the read
+ String text2 = "TTGTGCC";
+ KmerBytesWritable kmer2 = new KmerBytesWritable(text2.length());
+ kmer2.setByReadReverse(resultArray, 0);
+ Assert.assertEquals(text2, kmer2.toString());
+
+ KmerBytesWritable merge = new KmerBytesWritable(kmer1);
+ merge.mergeWithRFKmer(kmerSize, kmer2);
+ Assert.assertEquals(result, merge.toString());
+
+ int i = 1;
+ merge.set(kmer1);
+ merge.mergeWithRFKmer(i, kmer2);
+ Assert.assertEquals("GGCACAAAACAACCC", merge.toString());
+
+ i = 2;
+ merge.set(kmer1);
+ merge.mergeWithRFKmer(i, kmer2);
+ Assert.assertEquals("GGCACAAACAACCC", merge.toString());
+
+ i = 3;
+ merge.set(kmer1);
+ merge.mergeWithRFKmer(i, kmer2);
+ Assert.assertEquals("GGCACAACAACCC", merge.toString());
+
+ String test1;
+ String test2;
+ test1 = "CTA";
+ test2 = "AGA";
+ KmerBytesWritable k1 = new KmerBytesWritable(3);
+ KmerBytesWritable k2 = new KmerBytesWritable(3);
+ k1.setByRead(test1.getBytes(), 0);
+ k2.setByRead(test2.getBytes(), 0);
+ k1.mergeWithRFKmer(3, k2);
+ Assert.assertEquals("TCTA", k1.toString());
+
+ test1 = "CTA";
+ test2 = "ATA"; //TAT
+ k1 = new KmerBytesWritable(3);
+ k2 = new KmerBytesWritable(3);
+ k1.setByRead(test1.getBytes(), 0);
+ k2.setByRead(test2.getBytes(), 0);
+ k1.mergeWithFRKmer(3, k2);
+ Assert.assertEquals("CTAT", k1.toString());
+
+ test1 = "ATA";
+ test2 = "CTA"; //TAT
+ k1 = new KmerBytesWritable(3);
+ k2 = new KmerBytesWritable(3);
+ k1.setByRead(test1.getBytes(), 0);
+ k2.setByRead(test2.getBytes(), 0);
+ k1.mergeWithFRKmer(3, k2);
+ Assert.assertEquals("ATAG", k1.toString());
+
+ test1 = "TCTAT";
+ test2 = "GAAC";
+ k1 = new KmerBytesWritable(5);
+ k2 = new KmerBytesWritable(4);
+ k1.setByRead(test1.getBytes(), 0);
+ k2.setByRead(test2.getBytes(), 0);
+ k1.mergeWithRFKmer(3, k2);
+ Assert.assertEquals("GTTCTAT", k1.toString());
+ }
+
+
+
+ @Test
+ public void TestMergeRRKmer() {
+ byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+ String text = "AGCTGACCGT";
+ KmerBytesWritable kmer1 = new KmerBytesWritable(8);
+ kmer1.setByRead(array, 0);
+ String text1 = "AGCTGACC";
+ KmerBytesWritable kmer2 = new KmerBytesWritable(8);
+ kmer2.setByRead(array, 1);
+ String text2 = "GCTGACCG";
+ Assert.assertEquals(text2, kmer2.toString());
+ KmerBytesWritable merge = new KmerBytesWritable(kmer2);
+ int kmerSize = 8;
+ merge.mergeWithRRKmer(kmerSize, kmer1);
+ Assert.assertEquals(text1 + text2.substring(kmerSize - 1), merge.toString());
+
+ for (int i = 1; i < 8; i++) {
+ merge.set(kmer2);
+ merge.mergeWithRRKmer(i, kmer1);
+ Assert.assertEquals(text1.substring(0, text1.length() - i + 1) + text2, merge.toString());
+ }
+
+ for (int ik = 1; ik <= 10; ik++) {
+ for (int jk = 1; jk <= 10; jk++) {
+ kmer1 = new KmerBytesWritable(ik);
+ kmer2 = new KmerBytesWritable(jk);
+ kmer1.setByRead(array, 0);
+ kmer2.setByRead(array, 0);
+ text1 = text.substring(0, ik);
+ text2 = text.substring(0, jk);
+ Assert.assertEquals(text1, kmer1.toString());
+ Assert.assertEquals(text2, kmer2.toString());
+ for (int x = 1; x < ik; x++) {
+ merge.set(kmer2);
+ merge.mergeWithRRKmer(x, kmer1);
+ Assert.assertEquals(text1.substring(0, text1.length() - x + 1) + text2, merge.toString());
+ }
+ }
+ }
+ }
+
+ @Test
+ public void TestFinalMerge() {
+ String selfString;
+ String match;
+ String msgString;
+ int index;
+ KmerBytesWritable kmer = new KmerBytesWritable();
+ int kmerSize = 3;
+
+ String F1 = "AATAG";
+ String F2 = "TAGAA";
+ String R1 = "CTATT";
+ String R2 = "TTCTA";
+
+ //FF test
+ selfString = F1;
+ match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
+ msgString = F2;
+ index = msgString.indexOf(match);
+ kmer.reset(msgString.length() - index);
+ kmer.setByRead(msgString.substring(index).getBytes(), 0);
+ System.out.println(kmer.toString());
+
+ //FR test
+ selfString = F1;
+ match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
+ msgString = GeneCode.reverseComplement(R2);
+ index = msgString.indexOf(match);
+ kmer.reset(msgString.length() - index);
+ kmer.setByRead(msgString.substring(index).getBytes(), 0);
+ System.out.println(kmer.toString());
+
+ //RF test
+ selfString = R1;
+ match = selfString.substring(0,kmerSize - 1);
+ msgString = GeneCode.reverseComplement(F2);
+ index = msgString.lastIndexOf(match) + kmerSize - 2;
+ kmer.reset(index + 1);
+ kmer.setByReadReverse(msgString.substring(0, index + 1).getBytes(), 0);
+ System.out.println(kmer.toString());
+
+ //RR test
+ selfString = R1;
+ match = selfString.substring(0,kmerSize - 1);
+ msgString = R2;
+ index = msgString.lastIndexOf(match) + kmerSize - 2;
+ kmer.reset(index + 1);
+ kmer.setByRead(msgString.substring(0, index + 1).getBytes(), 0);
+ System.out.println(kmer.toString());
+
+ String[][] connectedTable = new String[][]{
+ {"FF", "RF"},
+ {"FF", "RR"},
+ {"FR", "RF"},
+ {"FR", "RR"}
+ };
+ System.out.println(connectedTable[0][1]);
+
+ Set<Long> s1 = new HashSet<Long>();
+ Set<Long> s2 = new HashSet<Long>();
+ s1.add((long) 1);
+ s1.add((long) 2);
+ s2.add((long) 2);
+ s2.add((long) 3);
+ Set<Long> intersection = new HashSet<Long>();
+ intersection.addAll(s1);
+ intersection.retainAll(s2);
+ System.out.println(intersection.toString());
+ Set<Long> difference = new HashSet<Long>();
+ difference.addAll(s1);
+ difference.removeAll(s2);
+ System.out.println(difference.toString());
+
+ Map<KmerBytesWritable, Set<Long>> map = new HashMap<KmerBytesWritable, Set<Long>>();
+ KmerBytesWritable k1 = new KmerBytesWritable(3);
+ Set<Long> set1 = new HashSet<Long>();
+ k1.setByRead(("CTA").getBytes(), 0);
+ set1.add((long)1);
+ map.put(k1, set1);
+ KmerBytesWritable k2 = new KmerBytesWritable(3);
+ k2.setByRead(("GTA").getBytes(), 0);
+ Set<Long> set2 = new HashSet<Long>();
+ set2.add((long) 2);
+ map.put(k2, set2);
+ KmerBytesWritable k3 = new KmerBytesWritable(3);
+ k3.setByRead(("ATG").getBytes(), 0);
+ Set<Long> set3 = new HashSet<Long>();
+ set3.add((long) 3);
+ map.put(k3, set3);
+ KmerBytesWritable k4 = new KmerBytesWritable(3);
+ k4.setByRead(("AAT").getBytes(), 0);
+ Set<Long> set4 = new HashSet<Long>();
+ set4.add((long) 4);
+ map.put(k4, set4);
+ System.out.println("CTA = " + map.get(k1).toString());
+ System.out.println("GTA = " + map.get(k2).toString());
+ System.out.println("ATG = " + map.get(k3).toString());
+ System.out.println("AAT = " + map.get(k4).toString());
+ }
+>>>>>>> 94e075b5c3db9aa613ef61c2581430a143b17bc8
+}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerListWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerListWritableTest.java
index 3c1428a..2f7bba8 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerListWritableTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerListWritableTest.java
@@ -20,20 +20,22 @@
//one kmer in list and reset each time
KmerBytesWritable kmer;
for (int i = 1; i < 200; i++) {
- kmer = new KmerBytesWritable(i);
+ KmerBytesWritable.setGlobalKmerLength(i);
+ kmer = new KmerBytesWritable();
String randomString = generaterRandomString(i);
byte[] array = randomString.getBytes();
kmer.setByRead(array, 0);
- kmerList.reset(kmer.getKmerLength());
+ kmerList.reset();
kmerList.append(kmer);
- Assert.assertEquals(kmerList.getPosition(0).toString(), randomString);
+ Assert.assertEquals(randomString, kmerList.getPosition(0).toString());
Assert.assertEquals(1, kmerList.getCountOfPosition());
}
- kmerList.reset(0);
+ kmerList.reset();
+ KmerBytesWritable.setGlobalKmerLength(5);
//add one more kmer each time and fix kmerSize
for (int i = 0; i < 200; i++) {
- kmer = new KmerBytesWritable(5);
+ kmer = new KmerBytesWritable();
String randomString = generaterRandomString(5);
byte[] array = randomString.getBytes();
kmer.setByRead(array, 0);
@@ -44,8 +46,8 @@
byte [] another = new byte [kmerList.getLength()*2];
int start = 20;
- System.arraycopy(kmerList.getByteArray(), 0, another, start, kmerList.getLength());
- KmerListWritable plist2 = new KmerListWritable(kmerList.kmerlength, kmerList.getCountOfPosition(),another,start);
+ System.arraycopy(kmerList.getByteArray(), kmerList.getStartOffset(), another, start, kmerList.getLength());
+ KmerListWritable plist2 = new KmerListWritable(another, start);
for(int i = 0; i < plist2.getCountOfPosition(); i++){
Assert.assertEquals(kmerList.getPosition(i).toString(), plist2.getPosition(i).toString());
}
@@ -59,12 +61,13 @@
int i;
KmerBytesWritable kmer;
for (i = 0; i < 200; i++) {
- kmer = new KmerBytesWritable(5);
+ KmerBytesWritable.setGlobalKmerLength(5);
+ kmer = new KmerBytesWritable();
String randomString = generaterRandomString(5);
byte[] array = randomString.getBytes();
kmer.setByRead(array, 0);
kmerList.append(kmer);
- Assert.assertEquals(kmerList.getPosition(i).toString(), randomString);
+ Assert.assertEquals(randomString, kmerList.getPosition(i).toString());
Assert.assertEquals(i + 1, kmerList.getCountOfPosition());
}
@@ -72,12 +75,12 @@
KmerBytesWritable tmpKmer = new KmerBytesWritable();
i = 0;
KmerListWritable copyList = new KmerListWritable();
- copyList.set(kmerList);
+ copyList.setCopy(kmerList);
Iterator<KmerBytesWritable> iterator;
for(int j = 0; j < 5; j++){
iterator = copyList.iterator();
byte[] array = kmerList.getPosition(j).toString().getBytes();
- KmerBytesWritable deletePos = new KmerBytesWritable(5);
+ KmerBytesWritable deletePos = new KmerBytesWritable();
deletePos.setByRead(array, 0);
while(iterator.hasNext()){
tmpKmer = iterator.next();
@@ -104,8 +107,9 @@
Assert.assertEquals(0, kmerList.getCountOfPosition());
- KmerListWritable edgeList = new KmerListWritable(3);
- KmerBytesWritable k = new KmerBytesWritable(3);
+ KmerBytesWritable.setGlobalKmerLength(3);
+ KmerListWritable edgeList = new KmerListWritable();
+ KmerBytesWritable k = new KmerBytesWritable();
k.setByRead(("AAA").getBytes(), 0);
edgeList.append(k);
k.setByRead(("CCC").getBytes(), 0);
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/PositionListWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/PositionListWritableTest.java
index fea658d..ac7322e 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/PositionListWritableTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/PositionListWritableTest.java
@@ -41,7 +41,7 @@
byte [] another = new byte [plist.getLength()*2];
int start = 20;
System.arraycopy(plist.getByteArray(), 0, another, start, plist.getLength());
- PositionListWritable plist2 = new PositionListWritable(plist.getCountOfPosition(),another,start);
+ PositionListWritable plist2 = new PositionListWritable(another,start);
for( i = 0; i < plist2.getCountOfPosition(); i++){
Assert.assertEquals(plist.getPosition(i), plist2.getPosition(i));
}
@@ -84,19 +84,22 @@
iterator = copyList.iterator();
PositionWritable deletePos = new PositionWritable();
deletePos.set((byte)1, (long)j, j);
+ boolean removed = false;
while(iterator.hasNext()){
pos = iterator.next();
if(pos.equals(deletePos)){
iterator.remove();
+ removed = true;
break;
}
}
+ Assert.assertTrue(removed);
Assert.assertEquals(5 - 1 - j, copyList.getCountOfPosition());
while(iterator.hasNext()){
pos = iterator.next();
- Assert.assertTrue(pos.getUUID() != deletePos.getUUID());
- Assert.assertTrue(pos.getReadId() != deletePos.getReadId());
- Assert.assertTrue(pos.getPosId() != deletePos.getPosId());
+ Assert.assertTrue(! (pos.getUUID() == deletePos.getUUID() &&
+ pos.getReadId() == deletePos.getReadId() &&
+ pos.getPosId() == deletePos.getPosId()));
i++;
}
}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/VKmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/VKmerBytesWritableTest.java
new file mode 100644
index 0000000..5dd4f82
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/VKmerBytesWritableTest.java
@@ -0,0 +1,399 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.data.test;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class VKmerBytesWritableTest {
+ static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
+ static int k = 7;
+
+ @Test
+ public void TestCompressKmer() { // packs the shared read at offsets 0 and 1 and checks the decoded text
+ VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AATAGAA", kmer.toString());
+ // Re-read the same buffer one base further along.
+ kmer.setByRead(array, 1);
+ Assert.assertEquals("ATAGAAG", kmer.toString());
+ }
+
+ @Test
+ public void TestMoveKmer() { // shifting in the next base must drop the leading base and return its code
+ VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AATAGAA", kmer.toString());
+ // With k = 7 and an 8-base read there are no interior bases, so this loop must never run.
+ for (int i = k; i < array.length - 1; i++) {
+ kmer.shiftKmerWithNextChar(array[i]); // array holds ASCII symbols, so use the char variant
+ Assert.fail("unexpected interior base at index " + i);
+ }
+ // Shift in the final base; the base pushed out of the window is the leading 'A'.
+ byte out = kmer.shiftKmerWithNextChar(array[array.length - 1]);
+ Assert.assertEquals(GeneCode.getCodeFromSymbol((byte) 'A'), out);
+ Assert.assertEquals("ATAGAAG", kmer.toString());
+ }
+
+ @Test
+ public void TestCompressKmerReverse() { // forward read at offset 0, then reverse read at offset 1
+ VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AATAGAA", kmer.toString());
+ // "CTTCTAT" is the reverse complement of "ATAGAAG", the 7 bases starting at offset 1.
+ kmer.setByReadReverse(array, 1);
+ Assert.assertEquals("CTTCTAT", kmer.toString());
+ }
+
+ @Test
+ public void TestMoveKmerReverse() { // shifting a base in at the front must push the trailing base out
+ VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AATAGAA", kmer.toString());
+ // With k = 7 and an 8-base read there are no interior bases, so this loop must never run.
+ for (int i = k; i < array.length - 1; i++) {
+ kmer.shiftKmerWithPreChar(array[i]);
+ Assert.fail("unexpected interior base at index " + i);
+ }
+ // Prepend the final base 'G'; the returned code is that of the 'A' that fell off the end.
+ byte out = kmer.shiftKmerWithPreChar(array[array.length - 1]);
+ Assert.assertEquals(GeneCode.getCodeFromSymbol((byte) 'A'), out);
+ Assert.assertEquals("GAATAGA", kmer.toString());
+ }
+
+ @Test
+ public void TestGetGene() {
+ // Round trip: pack a 9-base read, then check the symbol decoded at every position.
+ String expected = "AGCTGACCG";
+ byte[] bases = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G' };
+ VKmerBytesWritable packed = new VKmerBytesWritable(9);
+ packed.setByRead(bases, 0);
+ for (int pos = 0; pos < 9; pos++) {
+ Assert.assertEquals(expected.charAt(pos), (char) GeneCode.getSymbolFromCode(packed.getGeneCodeAtPosition(pos)));
+ }
+ }
+
+ @Test
+ public void TestGetOneByteFromKmer() { // extract and re-append one packed byte per position, for k in 3..10
+ byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+ String string = "AGCTGACCGT";
+ for (int k = 3; k <= 10; k++) {
+ VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+ VKmerBytesWritable kmerAppend = new VKmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(string.substring(0, k), kmer.toString());
+ for (int b = 0; b < k; b++) {
+ byte byteActual = KmerBytesWritable.getOneByteFromKmerAtPosition(b, kmer.getBytes(),
+ kmer.getKmerOffset(), kmer.getKmerByteLength());
+ byte byteExpect = GeneCode.getCodeFromSymbol(array[b]); // base b goes in the two lowest bits
+ for (int i = 1; i < 4 && b + i < k; i++) {
+ byteExpect += GeneCode.getCodeFromSymbol(array[b + i]) << (i * 2); // following bases, 2 bits each
+ }
+ Assert.assertEquals(byteExpect, byteActual);
+ KmerBytesWritable.appendOneByteAtPosition(b, byteActual, kmerAppend.getBytes(),
+ kmerAppend.getKmerOffset(), kmerAppend.getKmerByteLength());
+ }
+ Assert.assertEquals(kmer.toString(), kmerAppend.toString());
+ }
+ }
+
+ @Test
+ public void TestMergeFFKmer() {
+ byte[] read = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+ String readText = "AGCTGACCGT";
+ String headText = "AGCTGACC";
+ VKmerBytesWritable head = new VKmerBytesWritable(8);
+ head.setByRead(read, 0);
+ Assert.assertEquals(headText, head.toString());
+ // Second 8-mer starts one base later, so it overlaps the first by 7 bases.
+ String tailText = "GCTGACCG";
+ VKmerBytesWritable tail = new VKmerBytesWritable(8);
+ tail.setByRead(read, 1);
+ Assert.assertEquals(tailText, tail.toString());
+ // Merge with size 8: only the last base of the second kmer is expected to be appended.
+ final int fullK = 8;
+ VKmerBytesWritable merged = new VKmerBytesWritable(head);
+ merged.mergeWithFFKmer(fullK, tail);
+ Assert.assertEquals(headText + tailText.substring(fullK - 1), merged.toString());
+ // Smaller sizes: everything from index size - 1 of the second kmer is expected to be appended.
+ for (int size = 1; size < 8; size++) {
+ merged.setAsCopy(head);
+ merged.mergeWithFFKmer(size, tail);
+ Assert.assertEquals(headText + tailText.substring(size - 1), merged.toString());
+ }
+ // Sweep both kmer lengths from 1 to 10, each kmer taken from the start of the read.
+ for (int lenA = 1; lenA <= 10; lenA++) {
+ for (int lenB = 1; lenB <= 10; lenB++) {
+ head = new VKmerBytesWritable(lenA);
+ head.setByRead(read, 0);
+ tail = new VKmerBytesWritable(lenB);
+ tail.setByRead(read, 0);
+ headText = readText.substring(0, lenA);
+ tailText = readText.substring(0, lenB);
+ Assert.assertEquals(headText, head.toString());
+ Assert.assertEquals(tailText, tail.toString());
+ for (int size = 1; size < lenB; size++) {
+ merged.setAsCopy(head);
+ merged.mergeWithFFKmer(size, tail);
+ Assert.assertEquals(headText + tailText.substring(size - 1), merged.toString());
+ }
+ }
+ }
+ }
+
+ @Test
+ public void TestMergeFRKmer() {
+ // The full read; its prefix is read forward and its suffix as a reverse complement.
+ String result = "AAGCTAACAACC";
+ byte[] readBytes = result.getBytes();
+ int kmerSize = 3;
+
+ String prefixText = "AAGCTAA";
+ VKmerBytesWritable prefix = new VKmerBytesWritable(prefixText.length());
+ prefix.setByRead(readBytes, 0);
+ Assert.assertEquals(prefixText, prefix.toString());
+
+ // Reverse complement of the last seven bases of the read.
+ String suffixRcText = "GGTTGTT";
+ VKmerBytesWritable suffixRc = new VKmerBytesWritable(suffixRcText.length());
+ suffixRc.setByReadReverse(readBytes, result.length() - suffixRcText.length());
+ Assert.assertEquals(suffixRcText, suffixRc.toString());
+
+ // Merging with the real kmer size is expected to reconstruct the whole read.
+ VKmerBytesWritable merged = new VKmerBytesWritable(prefix);
+ merged.mergeWithFRKmer(kmerSize, suffixRc);
+ Assert.assertEquals(result, merged.toString());
+
+ // Expected text for sizes 1, 2 and 3: size - 1 bases are shared between the two parts.
+ String[] expectedBySize = {
+ "AAGCTAAAACAACC",
+ "AAGCTAAACAACC",
+ "AAGCTAACAACC"
+ };
+ // Each pass restarts from a fresh copy of the forward prefix.
+ for (int size = 1; size <= 3; size++) {
+ merged.setAsCopy(prefix);
+ merged.mergeWithFRKmer(size, suffixRc);
+ Assert.assertEquals(expectedBySize[size - 1], merged.toString());
+ }
+ }
+
+ @Test
+ public void TestMergeRFKmer() {
+ // The full read; its tail is read forward and its head as a reverse complement.
+ String result = "GGCACAACAACCC";
+ byte[] readBytes = result.getBytes();
+ int kmerSize = 3;
+
+ String tailText = "AACAACCC";
+ VKmerBytesWritable tail = new VKmerBytesWritable(tailText.length());
+ tail.setByRead(readBytes, 5);
+ Assert.assertEquals(tailText, tail.toString());
+
+ // Reverse complement of the first seven bases of the read.
+ String headRcText = "TTGTGCC";
+ VKmerBytesWritable headRc = new VKmerBytesWritable(headRcText.length());
+ headRc.setByReadReverse(readBytes, 0);
+ Assert.assertEquals(headRcText, headRc.toString());
+
+ // Merging with the real kmer size is expected to reconstruct the whole read.
+ VKmerBytesWritable merged = new VKmerBytesWritable(tail);
+ merged.mergeWithRFKmer(kmerSize, headRc);
+ Assert.assertEquals(result, merged.toString());
+
+ // Expected text for sizes 1, 2 and 3: size - 1 bases are shared between the two parts.
+ String[] expectedBySize = {
+ "GGCACAAAACAACCC",
+ "GGCACAAACAACCC",
+ "GGCACAACAACCC"
+ };
+ // Each pass restarts from a fresh copy of the forward tail.
+ for (int size = 1; size <= 3; size++) {
+ merged.setAsCopy(tail);
+ merged.mergeWithRFKmer(size, headRc);
+ Assert.assertEquals(expectedBySize[size - 1], merged.toString());
+ }
+
+ // String test1 = "CTTAT";
+ // String test2 = "AGACC"; // rc = GGTCT
+ // VKmerBytesWritable k1 = new VKmerBytesWritable(5);
+ // VKmerBytesWritable k2 = new VKmerBytesWritable(5);
+ // k1.setByRead(test1.getBytes(), 0);
+ // k2.setByRead(test2.getBytes(), 0);
+ // k1.mergeWithRFKmer(3, k2);
+ // Assert.assertEquals("GGTCTTAT", k1.toString()); //GGTCGTCT ->
+ // AGACGACC ??
+
+ // Minimal 3-mer case: "AGA" reverse-complements to "TCT",
+ // which overlaps "CTA" by two bases and contributes one new leading base.
+ VKmerBytesWritable target = new VKmerBytesWritable(3);
+ target.setByRead("CTA".getBytes(), 0);
+ VKmerBytesWritable flipped = new VKmerBytesWritable(3);
+ flipped.setByRead("AGA".getBytes(), 0);
+ target.mergeWithRFKmer(3, flipped);
+ Assert.assertEquals("TCTA", target.toString());
+ // Note: expecting "CTAT" here would be an incorrect test case --
+ // the merge always flips the passed-in kmer.
+ }
+
+ @Test
+ public void TestMergeRRKmer() {
+ byte[] read = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+ String readText = "AGCTGACCGT";
+ String upstreamText = "AGCTGACC";
+ String downstreamText = "GCTGACCG";
+ VKmerBytesWritable upstream = new VKmerBytesWritable(8);
+ upstream.setByRead(read, 0);
+ VKmerBytesWritable downstream = new VKmerBytesWritable(8);
+ downstream.setByRead(read, 1);
+ Assert.assertEquals(downstreamText, downstream.toString());
+ final int fullK = 8;
+ VKmerBytesWritable merged = new VKmerBytesWritable(downstream);
+ merged.mergeWithRRKmer(fullK, upstream);
+ Assert.assertEquals(upstreamText + downstreamText.substring(fullK - 1), merged.toString());
+ // Smaller sizes: all but the last (size - 1) bases of the upstream kmer are expected in front.
+ for (int size = 1; size < 8; size++) {
+ merged.setAsCopy(downstream);
+ merged.mergeWithRRKmer(size, upstream);
+ Assert.assertEquals(upstreamText.substring(0, upstreamText.length() - size + 1) + downstreamText, merged.toString());
+ }
+ // Sweep both kmer lengths from 1 to 10, each kmer taken from the start of the read.
+ for (int lenUp = 1; lenUp <= 10; lenUp++) {
+ for (int lenDown = 1; lenDown <= 10; lenDown++) {
+ upstream = new VKmerBytesWritable(lenUp);
+ upstream.setByRead(read, 0);
+ downstream = new VKmerBytesWritable(lenDown);
+ downstream.setByRead(read, 0);
+ upstreamText = readText.substring(0, lenUp);
+ downstreamText = readText.substring(0, lenDown);
+ Assert.assertEquals(upstreamText, upstream.toString());
+ Assert.assertEquals(downstreamText, downstream.toString());
+ for (int size = 1; size < lenUp; size++) {
+ merged.setAsCopy(downstream);
+ merged.mergeWithRRKmer(size, upstream);
+ Assert.assertEquals(upstreamText.substring(0, upstreamText.length() - size + 1) + downstreamText, merged.toString());
+ }
+ }
+ }
+ }
+
+ @Test
+ public void TestMergeRFAndRRKmer() {
+ // Chain an RF merge and then an RR merge onto the same 5-mer; each expected result gains one leading base.
+ VKmerBytesWritable merged = new VKmerBytesWritable(5);
+ merged.setByRead("TAGAT".getBytes(), 0);
+ VKmerBytesWritable rfNeighbor = new VKmerBytesWritable(5);
+ rfNeighbor.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+ VKmerBytesWritable rrNeighbor = new VKmerBytesWritable(5);
+ rrNeighbor.setByRead("GCTAG".getBytes(), 0);
+ // RF step.
+ merged.mergeWithRFKmer(5, rfNeighbor);
+ Assert.assertEquals("CTAGAT", merged.toString());
+ // RR step continues from the 6-base result.
+ merged.mergeWithRRKmer(5, rrNeighbor);
+ Assert.assertEquals("GCTAGAT", merged.toString());
+ }
+
+ @Test
+ public void TestMergeRFAndRFKmer() {
+ // Two successive RF merges onto the same 5-mer; each expected result gains one leading base.
+ VKmerBytesWritable merged = new VKmerBytesWritable(5);
+ merged.setByRead("TAGAT".getBytes(), 0);
+ VKmerBytesWritable firstRf = new VKmerBytesWritable(5);
+ firstRf.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+ VKmerBytesWritable secondRf = new VKmerBytesWritable(5);
+ secondRf.setByRead("CTAGC".getBytes(), 0); // rc = GCTAG
+ // First RF step.
+ merged.mergeWithRFKmer(5, firstRf);
+ Assert.assertEquals("CTAGAT", merged.toString());
+ // Second RF step continues from the 6-base result.
+ merged.mergeWithRFKmer(5, secondRf);
+ Assert.assertEquals("GCTAGAT", merged.toString());
+ }
+
+ @Test
+ public void TestMergeRFAndFRKmer() {
+ // Extend "TCTAG" with an RF merge (new base in front), then an FR merge (new base at the end).
+ VKmerBytesWritable center = new VKmerBytesWritable(5);
+ center.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+ VKmerBytesWritable rfNeighbor = new VKmerBytesWritable(5);
+ rfNeighbor.setByRead("TAGAT".getBytes(), 0); // rc = ATCTA
+ VKmerBytesWritable frNeighbor = new VKmerBytesWritable(5);
+ frNeighbor.setByRead("GCTAG".getBytes(), 0); // rc = CTAGC
+ // RF step.
+ center.mergeWithRFKmer(5, rfNeighbor);
+ Assert.assertEquals("ATCTAG", center.toString());
+ // FR step continues from the 6-base result.
+ center.mergeWithFRKmer(5, frNeighbor);
+ Assert.assertEquals("ATCTAGC", center.toString());
+ }
+
+ @Test
+ public void TestMergeRFAndFFKmer() {
+ // Extend "TCTAG" with an RF merge (new base in front), then an FF merge (new base at the end).
+ VKmerBytesWritable center = new VKmerBytesWritable(5);
+ center.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+ VKmerBytesWritable rfNeighbor = new VKmerBytesWritable(5);
+ rfNeighbor.setByRead("TAGAT".getBytes(), 0); // rc = ATCTA
+ VKmerBytesWritable ffNeighbor = new VKmerBytesWritable(5);
+ ffNeighbor.setByRead("CTAGC".getBytes(), 0); // rc = GCTAG
+ // RF step.
+ center.mergeWithRFKmer(5, rfNeighbor);
+ Assert.assertEquals("ATCTAG", center.toString());
+ // FF step continues from the 6-base result.
+ center.mergeWithFFKmer(5, ffNeighbor);
+ Assert.assertEquals("ATCTAGC", center.toString());
+ }
+
+ @Test
+ public void TestMergeThreeVKmersRF_FF() {
+ // Three 5-mers joined into one 7-base kmer: RF merge first, FF merge second.
+ byte[] middleBases = "TCTAG".getBytes(); // rc = CTAGA
+ byte[] leftBases = "TAGAT".getBytes(); // rc = ATCTA
+ byte[] rightBases = "CTAGC".getBytes(); // rc = GCTAG
+ VKmerBytesWritable left = new VKmerBytesWritable(5);
+ VKmerBytesWritable middle = new VKmerBytesWritable(5);
+ VKmerBytesWritable right = new VKmerBytesWritable(5);
+ left.setByRead(leftBases, 0);
+ middle.setByRead(middleBases, 0);
+ right.setByRead(rightBases, 0);
+ middle.mergeWithRFKmer(5, left);
+ Assert.assertEquals("ATCTAG", middle.toString());
+ middle.mergeWithFFKmer(5, right);
+ Assert.assertEquals("ATCTAGC", middle.toString());
+ }
+
+ @Test
+ public void TestMergeThreeVKmerRF_RF() {
+ // Three 5-mers joined into one 7-base kmer by two RF merges onto the first.
+ byte[] baseBases = "TAGAT".getBytes();
+ byte[] firstBases = "TCTAG".getBytes(); // rc = CTAGA
+ byte[] secondBases = "CTAGC".getBytes(); // rc = GCTAG
+ VKmerBytesWritable base = new VKmerBytesWritable(5);
+ VKmerBytesWritable first = new VKmerBytesWritable(5);
+ VKmerBytesWritable second = new VKmerBytesWritable(5);
+ base.setByRead(baseBases, 0);
+ first.setByRead(firstBases, 0);
+ second.setByRead(secondBases, 0);
+ base.mergeWithRFKmer(5, first);
+ Assert.assertEquals("CTAGAT", base.toString());
+ base.mergeWithRFKmer(5, second);
+ Assert.assertEquals("GCTAGAT", base.toString());
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixMapper.java
index 98f561f..833462c 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixMapper.java
@@ -17,6 +17,7 @@
import edu.uci.ics.genomix.type.NodeWritable;
import edu.uci.ics.genomix.type.PositionListWritable;
import edu.uci.ics.genomix.type.PositionWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
@SuppressWarnings("deprecation")
public class GenomixMapper extends MapReduceBase implements
@@ -49,17 +50,18 @@
@Override
public void configure(JobConf job) {
KMER_SIZE = Integer.parseInt(job.get("sizeKmer"));
- preForwardKmer = new KmerBytesWritable(KMER_SIZE);
- preReverseKmer = new KmerBytesWritable(KMER_SIZE);
- curForwardKmer = new KmerBytesWritable(KMER_SIZE);
- curReverseKmer = new KmerBytesWritable(KMER_SIZE);
- nextForwardKmer = new KmerBytesWritable(KMER_SIZE);
- nextReverseKmer = new KmerBytesWritable(KMER_SIZE);
+ KmerBytesWritable.setGlobalKmerLength(KMER_SIZE);
+ preForwardKmer = new KmerBytesWritable();
+ preReverseKmer = new KmerBytesWritable();
+ curForwardKmer = new KmerBytesWritable();
+ curReverseKmer = new KmerBytesWritable();
+ nextForwardKmer = new KmerBytesWritable();
+ nextReverseKmer = new KmerBytesWritable();
nodeId = new PositionWritable();
nodeIdList = new PositionListWritable();
- edgeListForPreKmer = new KmerListWritable(KMER_SIZE);
- edgeListForNextKmer = new KmerListWritable(KMER_SIZE);
- outputNode = new NodeWritable(KMER_SIZE);
+ edgeListForPreKmer = new KmerListWritable();
+ edgeListForNextKmer = new KmerListWritable();
+ outputNode = new NodeWritable();
preKmerDir = KmerDir.FORWARD;
curKmerDir = KmerDir.FORWARD;
nextKmerDir = KmerDir.FORWARD;
@@ -86,7 +88,7 @@
}
/** first kmer **/
- outputNode.reset(KMER_SIZE);
+ outputNode.reset();
curForwardKmer.setByRead(array, 0);
curReverseKmer.setByReadReverse(array, 0);
curKmerDir = curForwardKmer.compareTo(curReverseKmer) <= 0 ? KmerDir.FORWARD : KmerDir.REVERSE;
@@ -100,7 +102,7 @@
/** middle kmer **/
for (int i = KMER_SIZE + 1; i < array.length; i++) {
- outputNode.reset(KMER_SIZE);
+ outputNode.reset();
setPreKmerByOldCurKmer();
setCurKmerByOldNextKmer();
setNextKmer(array[i]);
@@ -114,7 +116,7 @@
}
/** last kmer **/
- outputNode.reset(KMER_SIZE);
+ outputNode.reset();
setPreKmerByOldCurKmer();
setCurKmerByOldNextKmer();
//set value.nodeId
@@ -138,12 +140,12 @@
case FORWARD:
switch(preKmerDir){
case FORWARD:
- edgeListForPreKmer.reset(KMER_SIZE);
+ edgeListForPreKmer.reset();
edgeListForPreKmer.append(preForwardKmer);
outputNode.setRRList(edgeListForPreKmer);
break;
case REVERSE:
- edgeListForPreKmer.reset(KMER_SIZE);
+ edgeListForPreKmer.reset();
edgeListForPreKmer.append(preReverseKmer);
outputNode.setRFList(edgeListForPreKmer);
break;
@@ -152,12 +154,12 @@
case REVERSE:
switch(preKmerDir){
case FORWARD:
- edgeListForPreKmer.reset(KMER_SIZE);
+ edgeListForPreKmer.reset();
edgeListForPreKmer.append(preForwardKmer);
outputNode.setFRList(edgeListForPreKmer);
break;
case REVERSE:
- edgeListForPreKmer.reset(KMER_SIZE);
+ edgeListForPreKmer.reset();
edgeListForPreKmer.append(preReverseKmer);
outputNode.setFFList(edgeListForPreKmer);
break;
@@ -171,12 +173,12 @@
case FORWARD:
switch(nextKmerDir){
case FORWARD:
- edgeListForNextKmer.reset(KMER_SIZE);
+ edgeListForNextKmer.reset();
edgeListForNextKmer.append(nextForwardKmer);
outputNode.setFFList(edgeListForNextKmer);
break;
case REVERSE:
- edgeListForNextKmer.reset(KMER_SIZE);
+ edgeListForNextKmer.reset();
edgeListForNextKmer.append(nextReverseKmer);
outputNode.setFRList(edgeListForNextKmer);
break;
@@ -185,12 +187,12 @@
case REVERSE:
switch(nextKmerDir){
case FORWARD:
- edgeListForNextKmer.reset(KMER_SIZE);
+ edgeListForNextKmer.reset();
edgeListForNextKmer.append(nextForwardKmer);
outputNode.setRFList(edgeListForNextKmer);
break;
case REVERSE:
- edgeListForNextKmer.reset(KMER_SIZE);
+ edgeListForNextKmer.reset();
edgeListForNextKmer.append(nextReverseKmer);
outputNode.setRRList(edgeListForNextKmer);
break;
@@ -201,7 +203,7 @@
//set preKmer by shifting curKmer with preChar
public void setPreKmer(byte preChar){
- preForwardKmer.set(curForwardKmer);
+ preForwardKmer.setAsCopy(curForwardKmer);
preForwardKmer.shiftKmerWithPreChar(preChar);
preReverseKmer.setByReadReverse(preForwardKmer.toString().getBytes(), preForwardKmer.getOffset());
preKmerDir = preForwardKmer.compareTo(preReverseKmer) <= 0 ? KmerDir.FORWARD : KmerDir.REVERSE;
@@ -209,7 +211,7 @@
//set nextKmer by shifting curKmer with nextChar
public void setNextKmer(byte nextChar){
- nextForwardKmer.set(curForwardKmer);
+ nextForwardKmer.setAsCopy(curForwardKmer);
nextForwardKmer.shiftKmerWithNextChar(nextChar);
nextReverseKmer.setByReadReverse(nextForwardKmer.toString().getBytes(), nextForwardKmer.getOffset());
nextKmerDir = nextForwardKmer.compareTo(nextReverseKmer) <= 0 ? KmerDir.FORWARD : KmerDir.REVERSE;
@@ -218,15 +220,15 @@
//old curKmer becomes current preKmer
public void setPreKmerByOldCurKmer(){
preKmerDir = curKmerDir;
- preForwardKmer.set(curForwardKmer);
- preReverseKmer.set(curReverseKmer);
+ preForwardKmer.setAsCopy(curForwardKmer);
+ preReverseKmer.setAsCopy(curReverseKmer);
}
//old nextKmer becomes current curKmer
public void setCurKmerByOldNextKmer(){
curKmerDir = nextKmerDir;
- curForwardKmer.set(nextForwardKmer);
- curReverseKmer.set(nextReverseKmer);
+ curForwardKmer.setAsCopy(nextForwardKmer);
+ curReverseKmer.setAsCopy(nextReverseKmer);
}
public void setMapperOutput(OutputCollector<KmerBytesWritable, NodeWritable> output) throws IOException{
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixReducer.java
index 1633c26..f39cdcb 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/contrailgraphbuilding/GenomixReducer.java
@@ -23,15 +23,16 @@
@Override
public void configure(JobConf job) {
KMER_SIZE = GenomixMapper.KMER_SIZE;
- outputNode = new NodeWritable(KMER_SIZE);
- tmpNode = new NodeWritable(KMER_SIZE);
+ KmerBytesWritable.setGlobalKmerLength(KMER_SIZE);
+ outputNode = new NodeWritable();
+ tmpNode = new NodeWritable();
}
@Override
public void reduce(KmerBytesWritable key, Iterator<NodeWritable> values,
OutputCollector<KmerBytesWritable, NodeWritable> output,
Reporter reporter) throws IOException {
- outputNode.reset(KMER_SIZE);
+ outputNode.reset();
while (values.hasNext()) {
tmpNode.set(values.next());
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3.java
index 8736fe3..b28328f 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/mergepaths/h3/MergePathsH3.java
@@ -27,6 +27,7 @@
import edu.uci.ics.genomix.hadoop.pmcommon.PathNodeInitial.PathNodeFlag;
import edu.uci.ics.genomix.oldtype.NodeWritable;
import edu.uci.ics.genomix.oldtype.PositionWritable;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
@SuppressWarnings("deprecation")
public class MergePathsH3 extends Configured implements Tool {
@@ -58,12 +59,14 @@
private boolean finalMerge;
public void configure(JobConf conf) {
+ KMER_SIZE = conf.getInt("sizeKmer", 0);
+ KmerBytesWritable.setGlobalKmerLength(KMER_SIZE);
+
randSeed = conf.getLong("randomSeed", 0);
randGenerator = new Random(randSeed);
probBeingRandomHead = conf.getFloat("probBeingRandomHead", 0.5f);
finalMerge = conf.getBoolean("finalMerge", false);
- KMER_SIZE = conf.getInt("sizeKmer", 0);
outputValue = new NodeWithFlagWritable(KMER_SIZE);
outputKey = new PositionWritable();
curNode = new NodeWritable(KMER_SIZE);
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/removetips/RemoveTips.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/removetips/RemoveTips.java
index 83fe200..f1bfec6 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/removetips/RemoveTips.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/graphclean/removetips/RemoveTips.java
@@ -60,7 +60,7 @@
// kill this node by NOT mapping it. Update my neighbors with a suicide note
//TODO: update neighbors by removing me from its list
} else {
- outputValue.set(MergeMessageFlag.MSG_SELF, curNode);
+ outputValue.setAsCopy(MergeMessageFlag.MSG_SELF, curNode);
output.collect(key, value);
}
}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/PositionListAndKmerWritable.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/PositionListAndKmerWritable.java
index 943b505..550cc7c 100644
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/PositionListAndKmerWritable.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/genomix/hadoop/velvetgraphbuilding/PositionListAndKmerWritable.java
@@ -54,12 +54,12 @@
public void set(PositionListAndKmerWritable right) {
this.countOfKmer = right.countOfKmer;
this.vertexIDList.set(right.vertexIDList);
- this.kmer.set(right.kmer);
+ this.kmer.setAsCopy(right.kmer);
}
public void set(PositionListWritable list, KmerBytesWritable kmer) {
this.vertexIDList.set(list);
- this.kmer.set(kmer);
+ this.kmer.setAsCopy(kmer);
}
@Override
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/data/primitive/NodeReference.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/data/primitive/NodeReference.java
index 8f7a69e..60c0682 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/data/primitive/NodeReference.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/data/primitive/NodeReference.java
@@ -15,7 +15,7 @@
package edu.uci.ics.genomix.hyracks.data.primitive;
-import edu.uci.ics.genomix.oldtype.NodeWritable;
+import edu.uci.ics.genomix.velvet.oldtype.NodeWritable;
public class NodeReference extends NodeWritable {
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/MapReadToNodeOperator.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/MapReadToNodeOperator.java
index 0f0aa29..1827651 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/MapReadToNodeOperator.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/MapReadToNodeOperator.java
@@ -19,9 +19,11 @@
import java.nio.ByteBuffer;
import edu.uci.ics.genomix.hyracks.data.primitive.NodeReference;
-import edu.uci.ics.genomix.oldtype.PositionListWritable;
-import edu.uci.ics.genomix.oldtype.PositionWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+
+import edu.uci.ics.genomix.velvet.oldtype.PositionListWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionWritable;
+import edu.uci.ics.genomix.velvet.oldtype.KmerBytesWritable;
+
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/ReadsKeyValueParserFactory.java
index 9aea9ad..2134177 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/ReadsKeyValueParserFactory.java
@@ -25,8 +25,8 @@
import org.apache.hadoop.io.Text;
import edu.uci.ics.genomix.hyracks.data.primitive.PositionReference;
-import edu.uci.ics.genomix.type.GeneCode;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.velvet.oldtype.GeneCode;
+import edu.uci.ics.genomix.velvet.oldtype.KmerBytesWritable;
import edu.uci.ics.hyracks.api.comm.IFrameWriter;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/KMerSequenceWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/KMerSequenceWriterFactory.java
index f6c2cf9..def046b 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/KMerSequenceWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/KMerSequenceWriterFactory.java
@@ -25,9 +25,9 @@
import org.apache.hadoop.mapred.JobConf;
import edu.uci.ics.genomix.hyracks.job.GenomixJobConf;
-import edu.uci.ics.genomix.oldtype.PositionListWritable;
-import edu.uci.ics.genomix.oldtype.PositionWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionListWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionWritable;
+import edu.uci.ics.genomix.velvet.oldtype.KmerBytesWritable;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/KMerTextWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/KMerTextWriterFactory.java
index e7526ed..652a6f2 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/KMerTextWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/KMerTextWriterFactory.java
@@ -17,9 +17,9 @@
import java.io.DataOutput;
import java.io.IOException;
-import edu.uci.ics.genomix.oldtype.PositionListWritable;
-import edu.uci.ics.genomix.oldtype.PositionWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionListWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionWritable;
+import edu.uci.ics.genomix.velvet.oldtype.KmerBytesWritable;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/NodeSequenceWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/NodeSequenceWriterFactory.java
index 77efcf8..e116ab9 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/NodeSequenceWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/NodeSequenceWriterFactory.java
@@ -27,8 +27,8 @@
import edu.uci.ics.genomix.data.Marshal;
import edu.uci.ics.genomix.hyracks.dataflow.MapReadToNodeOperator;
import edu.uci.ics.genomix.hyracks.job.GenomixJobConf;
-import edu.uci.ics.genomix.oldtype.NodeWritable;
-import edu.uci.ics.genomix.oldtype.PositionWritable;
+import edu.uci.ics.genomix.velvet.oldtype.NodeWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionWritable;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/NodeTextWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/NodeTextWriterFactory.java
index dffd3a9..bc00aa5 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/NodeTextWriterFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/dataflow/io/NodeTextWriterFactory.java
@@ -18,8 +18,8 @@
import java.io.IOException;
import edu.uci.ics.genomix.data.Marshal;
-import edu.uci.ics.genomix.oldtype.NodeWritable;
-import edu.uci.ics.genomix.oldtype.PositionWritable;
+import edu.uci.ics.genomix.velvet.oldtype.NodeWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionWritable;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenCheckReader.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenCheckReader.java
index c3ec3c7..b4b1e73 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenCheckReader.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenCheckReader.java
@@ -19,8 +19,8 @@
import java.util.Map;
import edu.uci.ics.genomix.hyracks.dataflow.ReadsKeyValueParserFactory;
-import edu.uci.ics.genomix.oldtype.PositionWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionWritable;
+import edu.uci.ics.genomix.velvet.oldtype.KmerBytesWritable;
import edu.uci.ics.hyracks.api.client.NodeControllerInfo;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenGroupbyReadID.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenGroupbyReadID.java
index 559060a..1e78b79 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenGroupbyReadID.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenGroupbyReadID.java
@@ -19,8 +19,8 @@
import java.util.Map;
import edu.uci.ics.genomix.data.Marshal;
-import edu.uci.ics.genomix.oldtype.PositionListWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionListWritable;
+import edu.uci.ics.genomix.velvet.oldtype.KmerBytesWritable;
import edu.uci.ics.hyracks.api.client.NodeControllerInfo;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenMapKmerToRead.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenMapKmerToRead.java
index e9aede5..8e727959 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenMapKmerToRead.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/job/JobGenMapKmerToRead.java
@@ -20,8 +20,8 @@
import edu.uci.ics.genomix.data.Marshal;
import edu.uci.ics.genomix.hyracks.dataflow.MapKmerPositionToReadOperator;
-import edu.uci.ics.genomix.oldtype.PositionListWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.velvet.oldtype.PositionListWritable;
+import edu.uci.ics.genomix.velvet.oldtype.KmerBytesWritable;
import edu.uci.ics.hyracks.api.client.NodeControllerInfo;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ConnectorPolicyAssignmentPolicy.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ConnectorPolicyAssignmentPolicy.java
new file mode 100644
index 0000000..6919e76
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ConnectorPolicyAssignmentPolicy.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.hyracks.newgraph.dataflow;
+
+import edu.uci.ics.hyracks.api.dataflow.IConnectorDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.connectors.IConnectorPolicy;
+import edu.uci.ics.hyracks.api.dataflow.connectors.IConnectorPolicyAssignmentPolicy;
+import edu.uci.ics.hyracks.api.dataflow.connectors.PipeliningConnectorPolicy;
+import edu.uci.ics.hyracks.api.dataflow.connectors.SendSideMaterializedPipeliningConnectorPolicy;
+import edu.uci.ics.hyracks.dataflow.std.connectors.MToNPartitioningMergingConnectorDescriptor;
+
+/**
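+ * Used by the precluster group-by: M-to-N partitioning-merging connectors get the send-side-materialized pipelining policy, all other connectors are pipelined.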
+ */
+public class ConnectorPolicyAssignmentPolicy implements IConnectorPolicyAssignmentPolicy {
+ private static final long serialVersionUID = 1L;
+ private IConnectorPolicy senderSideMaterializePolicy = new SendSideMaterializedPipeliningConnectorPolicy();
+ private IConnectorPolicy pipeliningPolicy = new PipeliningConnectorPolicy();
+
+ @Override
+ public IConnectorPolicy getConnectorPolicyAssignment(IConnectorDescriptor c, int nProducers, int nConsumers,
+ int[] fanouts) {
+ if (c instanceof MToNPartitioningMergingConnectorDescriptor) {
+ return senderSideMaterializePolicy;
+ } else {
+ return pipeliningPolicy;
+ }
+ }
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ReadsKeyValueParserFactory.java
index 3650553..84f9fe0 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ReadsKeyValueParserFactory.java
@@ -24,10 +24,12 @@
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
-import edu.uci.ics.genomix.oldtype.IntermediateNodeWritable;
import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerListWritable;
+import edu.uci.ics.genomix.type.NodeWritable;
+import edu.uci.ics.genomix.type.PositionListWritable;
import edu.uci.ics.genomix.type.PositionWritable;
+
import edu.uci.ics.hyracks.api.comm.IFrameWriter;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
@@ -44,38 +46,53 @@
private static final Log LOG = LogFactory.getLog(ReadsKeyValueParserFactory.class);
public static final int OutputKmerField = 0;
- public static final int OutputNodeIdField = 1;
- public static final int OutputForwardForwardField = 2;
- public static final int OutputForwardReverseField = 3;
- public static final int OutputReverseForwardField = 4;
- public static final int OutputReverseReverseField = 5;
+ public static final int OutputNodeField = 1;
+
private final int readLength;
private final int kmerSize;
public static final RecordDescriptor readKmerOutputRec = new RecordDescriptor(new ISerializerDeserializer[] { null,
- null });
+ null});
- public ReadsKeyValueParserFactory(int readlength, int k, boolean bGenerateReversed) {
+ public ReadsKeyValueParserFactory(int readlength, int k) {
this.readLength = readlength;
this.kmerSize = k;
}
-
+
+ public static enum KmerDir {
+ FORWARD,
+ REVERSE,
+ }
+
@Override
public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
final ArrayTupleBuilder tupleBuilder = new ArrayTupleBuilder(2);
final ByteBuffer outputBuffer = ctx.allocateFrame();
final FrameTupleAppender outputAppender = new FrameTupleAppender(ctx.getFrameSize());
outputAppender.reset(outputBuffer, true);
-
+ KmerBytesWritable.setGlobalKmerLength(kmerSize);
return new IKeyValueParser<LongWritable, Text>() {
-
- private KmerBytesWritable kmer = new KmerBytesWritable(kmerSize);
- private KmerBytesWritable nextKmer = new KmerBytesWritable(kmerSize);
+
private PositionWritable nodeId = new PositionWritable();
- private KmerListWritable kmerList = new KmerListWritable(kmerSize);
- private IntermediateNodeWritable interMediateNode = new IntermediateNodeWritable();
- private byte mateId = 0;
+ private PositionListWritable nodeIdList = new PositionListWritable();
+ private KmerListWritable edgeListForPreKmer = new KmerListWritable();
+ private KmerListWritable edgeListForNextKmer = new KmerListWritable();
+ private NodeWritable outputNode = new NodeWritable();
+// private NodeWritable outputNode2 = new NodeWritable();
+
+ private KmerBytesWritable preForwardKmer = new KmerBytesWritable();
+ private KmerBytesWritable preReverseKmer = new KmerBytesWritable();
+ private KmerBytesWritable curForwardKmer = new KmerBytesWritable();
+ private KmerBytesWritable curReverseKmer = new KmerBytesWritable();
+ private KmerBytesWritable nextForwardKmer = new KmerBytesWritable();
+ private KmerBytesWritable nextReverseKmer = new KmerBytesWritable();
+
+ private KmerDir preKmerDir = KmerDir.FORWARD;
+ private KmerDir curKmerDir = KmerDir.FORWARD;
+ private KmerDir nextKmerDir = KmerDir.FORWARD;
+
+ byte mateId = (byte) 0;
@Override
public void parse(LongWritable key, Text value, IFrameWriter writer) throws HyracksDataException {
@@ -104,54 +121,148 @@
}
private void SplitReads(int readID, byte[] array, IFrameWriter writer) {
- /** first kmer */
+ /*first kmer: has no predecessor, so only the edge to the next kmer is set*/
if (kmerSize >= array.length) {
return;
}
- kmer.setByRead(array, 0);
- nextKmer.set(kmer);
- nextKmer.shiftKmerWithNextChar(array[kmerSize]);
- kmerList.append(nextKmer);
- nextKmer.toString();
- kmerList.toString();
-// nodeId.set(mateId, readID, 1);
-// interMediateNode.setNodeId(nodeId);
-// interMediateNode.setFFList(kmerList);
- InsertToFrame(kmer, kmerList, writer);
+ outputNode.reset();
+ curForwardKmer.setByRead(array, 0);
+ curReverseKmer.setByReadReverse(array, 0);
+ curKmerDir = curForwardKmer.compareTo(curReverseKmer) <= 0 ? KmerDir.FORWARD : KmerDir.REVERSE;
+ setNextKmer(array[kmerSize]);
+ setnodeId(mateId, readID, 0);
+ setEdgeListForNextKmer();
+ writeToFrame(writer);
- /** middle kmer */
+ /*middle kmers: each one gets an edge to its predecessor and to its successor*/
int i = kmerSize;
for (; i < array.length - 1; i++) {
- kmer.shiftKmerWithNextChar(array[i]);
- nextKmer.set(kmer);
- nextKmer.shiftKmerWithNextChar(array[i+1]);
- kmerList.append(nextKmer);
-// nodeId.set(mateId, readID, i - kmerSize + 2);
-// interMediateNode.setNodeId(nodeId);
-// interMediateNode.setFFList(kmerList);
- InsertToFrame(kmer, kmerList, writer);
+ outputNode.reset();
+ setPreKmerByOldCurKmer();
+ setCurKmerByOldNextKmer();
+ setNextKmer(array[i + 1]); // cur now ends at array[i], so the successor must shift in array[i + 1]
+ setnodeId(mateId, readID, 0);//i - kmerSize + 1
+ setEdgeListForPreKmer();
+ setEdgeListForNextKmer();
+ writeToFrame(writer);
}
-//
-// /** last kmer */
-// kmer.shiftKmerWithNextChar(array[i]);
-// nodeId.set(mateId, readID, i - kmerSize + 2);
-// interMediateNode.setNodeId(nodeId);
-// InsertToFrame(kmer, interMediateNode, writer);
+
+ /*last kmer: has no successor, so only the edge to the previous kmer is set*/
+ outputNode.reset();
+ setPreKmerByOldCurKmer();
+ setCurKmerByOldNextKmer();
+ setnodeId(mateId, readID, 0);//array.length - kmerSize + 1
+ setEdgeListForPreKmer();
+ writeToFrame(writer);
}
- //IntermediateNodeWritable node
- private void InsertToFrame(KmerBytesWritable kmer, KmerListWritable kmerList, IFrameWriter writer) {
+
+ public void setnodeId(byte mateId, long readID, int posId){
+ nodeId.set(mateId, readID, posId);
+ nodeIdList.reset();
+ nodeIdList.append(nodeId);
+ outputNode.setNodeIdList(nodeIdList);
+ }
+
+ public void setNextKmer(byte nextChar){
+ nextForwardKmer.setAsCopy(curForwardKmer);
+ nextForwardKmer.shiftKmerWithNextChar(nextChar);
+ nextReverseKmer.setByReadReverse(nextForwardKmer.toString().getBytes(), nextForwardKmer.getOffset());
+ nextKmerDir = nextForwardKmer.compareTo(nextReverseKmer) <= 0 ? KmerDir.FORWARD : KmerDir.REVERSE;
+ }
+
+ public void setPreKmerByOldCurKmer(){
+ preKmerDir = curKmerDir;
+ preForwardKmer.setAsCopy(curForwardKmer);
+ preReverseKmer.setAsCopy(curReverseKmer);
+ }
+
+ public void setCurKmerByOldNextKmer(){
+ curKmerDir = nextKmerDir;
+ curForwardKmer.setAsCopy(nextForwardKmer);
+ curReverseKmer.setAsCopy(nextReverseKmer);
+ }
+
+ public void writeToFrame(IFrameWriter writer) {
+ switch(curKmerDir){
+ case FORWARD:
+ InsertToFrame(curForwardKmer, outputNode, writer);
+ break;
+ case REVERSE:
+ InsertToFrame(curReverseKmer, outputNode, writer);
+ break;
+ }
+ }
+ public void setEdgeListForPreKmer(){
+ switch(curKmerDir){
+ case FORWARD:
+ switch(preKmerDir){
+ case FORWARD:
+ edgeListForPreKmer.reset();
+ edgeListForPreKmer.append(preForwardKmer);
+ outputNode.setRRList(edgeListForPreKmer);
+ break;
+ case REVERSE:
+ edgeListForPreKmer.reset();
+ edgeListForPreKmer.append(preReverseKmer);
+ outputNode.setRFList(edgeListForPreKmer);
+ break;
+ }
+ break;
+ case REVERSE:
+ switch(preKmerDir){
+ case FORWARD:
+ edgeListForPreKmer.reset();
+ edgeListForPreKmer.append(preForwardKmer);
+ outputNode.setFRList(edgeListForPreKmer);
+ break;
+ case REVERSE:
+ edgeListForPreKmer.reset();
+ edgeListForPreKmer.append(preReverseKmer);
+ outputNode.setFFList(edgeListForPreKmer);
+ break;
+ }
+ break;
+ }
+ }
+
+ public void setEdgeListForNextKmer(){
+ switch(curKmerDir){
+ case FORWARD:
+ switch(nextKmerDir){
+ case FORWARD:
+ edgeListForNextKmer.reset();
+ edgeListForNextKmer.append(nextForwardKmer);
+ outputNode.setFFList(edgeListForNextKmer);
+ break;
+ case REVERSE:
+ edgeListForNextKmer.reset();
+ edgeListForNextKmer.append(nextReverseKmer);
+ outputNode.setFRList(edgeListForNextKmer);
+ break;
+ }
+ break;
+ case REVERSE:
+ switch(nextKmerDir){
+ case FORWARD:
+ edgeListForNextKmer.reset();
+ edgeListForNextKmer.append(nextForwardKmer);
+ outputNode.setRFList(edgeListForNextKmer);
+ break;
+ case REVERSE:
+ edgeListForNextKmer.reset();
+ edgeListForNextKmer.append(nextReverseKmer);
+ outputNode.setRRList(edgeListForNextKmer);
+ break;
+ }
+ break;
+ }
+ }
+
+ private void InsertToFrame(KmerBytesWritable kmer, NodeWritable node, IFrameWriter writer) {
try {
-// if (Math.abs(node.getNodeId().getPosId()) > 32768) {
-// throw new IllegalArgumentException("Position id is beyond 32768 at " + node.getNodeId().getReadId());
-// }
tupleBuilder.reset();
tupleBuilder.addField(kmer.getBytes(), kmer.getOffset(), kmer.getLength());
- tupleBuilder.addField(kmerList.getByteArray(), kmer.getOffset(), kmer.getLength());
- //tupleBuilder.addField(node.getNodeId().getByteArray(), node.getNodeId().getStartOffset(), node.getNodeId().getLength());
-// tupleBuilder.addField(node.getFFList().getByteArray(), node.getFFList().getStartOffset(), node.getFFList().getLength());
-// tupleBuilder.addField(node.getFRList().getByteArray(), node.getFRList().getStartOffset(), node.getFRList().getLength());
-// tupleBuilder.addField(node.getRFList().getByteArray(), node.getRFList().getStartOffset(), node.getRFList().getLength());
-// tupleBuilder.addField(node.getRRList().getByteArray(), node.getRRList().getStartOffset(), node.getRRList().getLength());
+ tupleBuilder.addField(node.marshalToByteArray(), 0, node.getSerializedLength());
if (!outputAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0,
tupleBuilder.getSize())) {
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/aggregators/AggregateKmerAggregateFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/aggregators/AggregateKmerAggregateFactory.java
new file mode 100644
index 0000000..46fdd0e
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/aggregators/AggregateKmerAggregateFactory.java
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.hyracks.newgraph.dataflow.aggregators;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.NodeWritable;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+
+public class AggregateKmerAggregateFactory implements IAggregatorDescriptorFactory {
+
+ /**
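+ * Local aggregate: appends the node-id list and the FF/FR/RF/RR lists of every tuple in a group into one NodeWritable.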
+ */
+ private static final long serialVersionUID = 1L;
+ private final int kmerSize;
+
+ public AggregateKmerAggregateFactory(int k) {
+ this.kmerSize = k;
+ }
+
+ @Override
+ public IAggregatorDescriptor createAggregator(IHyracksTaskContext ctx, RecordDescriptor inRecordDescriptor,
+ RecordDescriptor outRecordDescriptor, int[] keyFields, int[] keyFieldsInPartialResults)
+ throws HyracksDataException {
+ KmerBytesWritable.setGlobalKmerLength(kmerSize);
+ return new IAggregatorDescriptor() {
+
+ private NodeWritable readNode = new NodeWritable();
+
+ protected int getOffSet(IFrameTupleAccessor accessor, int tIndex, int fieldId) {
+ int tupleOffset = accessor.getTupleStartOffset(tIndex);
+ int fieldStart = accessor.getFieldStartOffset(tIndex, fieldId);
+ int offset = tupleOffset + fieldStart + accessor.getFieldSlotsLength();
+ return offset;
+ }
+
+ @Override
+ public void reset() {
+ }
+
+ @Override
+ public void close() {
+
+ }
+
+ @Override
+ public AggregateState createAggregateStates() {
+ return new AggregateState(new NodeWritable());
+ }
+
+ @Override
+ public void init(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ NodeWritable localUniNode = (NodeWritable) state.state;
+ localUniNode.reset();
+ readNode.setAsReference(accessor.getBuffer().array(), getOffSet(accessor, tIndex, 1));
+ localUniNode.getNodeIdList().appendList(readNode.getNodeIdList());
+ localUniNode.getFFList().appendList(readNode.getFFList());
+ localUniNode.getFRList().appendList(readNode.getFRList());
+ localUniNode.getRFList().appendList(readNode.getRFList());
+ localUniNode.getRRList().appendList(readNode.getRRList());
+
+ // add an empty placeholder field here; the aggregated node itself is written in outputFinalResult
+ tupleBuilder.addFieldEndOffset();// mark question?
+ }
+
+ @Override
+ public void aggregate(IFrameTupleAccessor accessor, int tIndex, IFrameTupleAccessor stateAccessor,
+ int stateTupleIndex, AggregateState state) throws HyracksDataException {
+ NodeWritable localUniNode = (NodeWritable) state.state;
+ readNode.setAsReference(accessor.getBuffer().array(), getOffSet(accessor, tIndex, 1));
+ localUniNode.getNodeIdList().appendList(readNode.getNodeIdList());
+ localUniNode.getFFList().appendList(readNode.getFFList());
+ localUniNode.getFRList().appendList(readNode.getFRList());
+ localUniNode.getRFList().appendList(readNode.getRFList());
+ localUniNode.getRRList().appendList(readNode.getRRList());
+ }
+
+ @Override
+ public void outputPartialResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ throw new IllegalStateException("partial result method should not be called");
+ }
+
+ @Override
+ public void outputFinalResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ NodeWritable localUniNode = (NodeWritable) state.state; // node accumulated by init()/aggregate()
+ try {
+ fieldOutput.write(localUniNode.marshalToByteArray(), 0, localUniNode.getSerializedLength());
+ // mark the end of the field that now holds the serialized node
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when writing aggregation to the output buffer.", e); // keep the cause
+ }
+ }
+
+ };
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/aggregators/MergeKmerAggregateFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/aggregators/MergeKmerAggregateFactory.java
new file mode 100644
index 0000000..1ee6cae
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/aggregators/MergeKmerAggregateFactory.java
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.hyracks.newgraph.dataflow.aggregators;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.NodeWritable;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+
+public class MergeKmerAggregateFactory implements IAggregatorDescriptorFactory {
+ private static final long serialVersionUID = 1L;
+ private static final Log LOG = LogFactory.getLog(MergeKmerAggregateFactory.class);
+
+ private final int kmerSize;
+
+ public MergeKmerAggregateFactory(int k) {
+ this.kmerSize = k;
+ }
+
+ @Override
+ public IAggregatorDescriptor createAggregator(IHyracksTaskContext ctx, RecordDescriptor inRecordDescriptor,
+ RecordDescriptor outRecordDescriptor, int[] keyFields, int[] keyFieldsInPartialResults)
+ throws HyracksDataException {
+ final int frameSize = ctx.getFrameSize();
+ KmerBytesWritable.setGlobalKmerLength(kmerSize);
+ return new IAggregatorDescriptor() {
+
+ private NodeWritable readNode = new NodeWritable();
+
+ protected int getOffSet(IFrameTupleAccessor accessor, int tIndex, int fieldId) {
+ int tupleOffset = accessor.getTupleStartOffset(tIndex);
+ int fieldStart = accessor.getFieldStartOffset(tIndex, fieldId);
+ int offset = tupleOffset + fieldStart + accessor.getFieldSlotsLength();
+ return offset;
+ }
+
+ @Override
+ public AggregateState createAggregateStates() {
+ return new AggregateState(new NodeWritable());
+ }
+
+ @Override
+ public void init(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ NodeWritable localUniNode = (NodeWritable) state.state;
+ localUniNode.reset();
+ readNode.setAsReference(accessor.getBuffer().array(), getOffSet(accessor, tIndex, 1));
+ localUniNode.getNodeIdList().unionUpdate(readNode.getNodeIdList());
+ localUniNode.getFFList().unionUpdate(readNode.getFFList());
+ localUniNode.getFRList().unionUpdate(readNode.getFRList());
+ localUniNode.getRFList().unionUpdate(readNode.getRFList());
+ localUniNode.getRRList().unionUpdate(readNode.getRRList());
+
+ // make a fake (empty) field to satisfy the caller; the aggregated node is written in outputFinalResult
+ tupleBuilder.addFieldEndOffset();
+ }
+
+ @Override
+ public void reset() {
+
+ }
+
+ @Override
+ public void aggregate(IFrameTupleAccessor accessor, int tIndex, IFrameTupleAccessor stateAccessor,
+ int stateTupleIndex, AggregateState state) throws HyracksDataException {
+ NodeWritable localUniNode = (NodeWritable) state.state;
+ readNode.setAsReference(accessor.getBuffer().array(), getOffSet(accessor, tIndex, 1));
+ localUniNode.getNodeIdList().unionUpdate(readNode.getNodeIdList());
+ localUniNode.getFFList().unionUpdate(readNode.getFFList());
+ localUniNode.getFRList().unionUpdate(readNode.getFRList());
+ localUniNode.getRFList().unionUpdate(readNode.getRFList());
+ localUniNode.getRRList().unionUpdate(readNode.getRRList());
+ }
+
+ @Override
+ public void outputPartialResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ throw new IllegalStateException("partial result method should not be called");
+ }
+
+ @Override
+ public void outputFinalResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ NodeWritable localUniNode = (NodeWritable) state.state; // node accumulated by init()/aggregate()
+ try {
+ if (localUniNode.getSerializedLength() > frameSize / 2) { // only warns; the node is still written below
+ LOG.warn("MergeKmer: output data kmerByteSize is too big: " + localUniNode.getSerializedLength());
+ }
+ fieldOutput.write(localUniNode.marshalToByteArray(), 0, localUniNode.getSerializedLength());
+ tupleBuilder.addFieldEndOffset(); // mark the end of the field that now holds the serialized node
+
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when writing aggregation to the output buffer.", e); // keep the cause
+ }
+ }
+
+ @Override
+ public void close() {
+
+ }
+
+ };
+
+ }
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/driver/Driver.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/driver/Driver.java
index 6d6e1e6..4602ed2 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/driver/Driver.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/driver/Driver.java
@@ -1,3 +1,18 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package edu.uci.ics.genomix.hyracks.newgraph.driver;
import java.net.URL;
@@ -6,10 +21,12 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.GenericOptionsParser;
-import edu.uci.ics.genomix.hyracks.driver.Driver.Plan;
-import edu.uci.ics.genomix.hyracks.job.GenomixJobConf;
-import edu.uci.ics.genomix.hyracks.job.JobGen;
+import edu.uci.ics.genomix.hyracks.newgraph.job.GenomixJobConf;
+import edu.uci.ics.genomix.hyracks.newgraph.job.JobGen;
+import edu.uci.ics.genomix.hyracks.newgraph.job.JobGenBrujinGraph;
import edu.uci.ics.genomix.hyracks.newgraph.job.JobGenCheckReader;
import edu.uci.ics.hyracks.api.client.HyracksConnection;
@@ -23,19 +40,21 @@
public class Driver {
public static enum Plan {
+ BUILD_DEBRUJIN_GRAPH,
CHECK_KMERREADER,
}
-
+
private static final String IS_PROFILING = "genomix.driver.profiling";
private static final String CPARTITION_PER_MACHINE = "genomix.driver.duplicate.num";
private static final Log LOG = LogFactory.getLog(Driver.class);
private JobGen jobGen;
private boolean profiling;
+
private int numPartitionPerMachine;
private IHyracksClientConnection hcc;
private Scheduler scheduler;
-
+
public Driver(String ipAddress, int port, int numPartitionPerMachine) throws HyracksException {
try {
hcc = new HyracksConnection(ipAddress, port);
@@ -45,9 +64,9 @@
}
this.numPartitionPerMachine = numPartitionPerMachine;
}
-
+
public void runJob(GenomixJobConf job) throws HyracksException {
- runJob(job, Plan.CHECK_KMERREADER, false);
+ runJob(job, Plan.BUILD_DEBRUJIN_GRAPH, false);
}
public void runJob(GenomixJobConf job, Plan planChoice, boolean profiling) throws HyracksException {
@@ -69,8 +88,11 @@
Map<String, NodeControllerInfo> ncMap = hcc.getNodeControllerInfos();
LOG.info("ncmap:" + ncMap.size() + " " + ncMap.keySet().toString());
switch (planChoice) {
- case CHECK_KMERREADER:
+ case BUILD_DEBRUJIN_GRAPH:
default:
+ jobGen = new JobGenBrujinGraph(job, scheduler, ncMap, numPartitionPerMachine);
+ break;
+ case CHECK_KMERREADER:
jobGen = new JobGenCheckReader(job, scheduler, ncMap, numPartitionPerMachine);
break;
}
@@ -85,7 +107,7 @@
throw new HyracksException(e);
}
}
-
+
private void run(JobGen jobGen) throws Exception {
try {
JobSpecification createJob = jobGen.generateJob();
@@ -95,11 +117,37 @@
throw e;
}
}
-
+
private void execute(JobSpecification job) throws Exception {
job.setUseConnectorPolicyForScheduling(false);
- JobId jobId = hcc
- .startJob(job, profiling ? EnumSet.of(JobFlag.PROFILE_RUNTIME) : EnumSet.noneOf(JobFlag.class));
+ JobId jobId = hcc.startJob(job, profiling ? EnumSet.of(JobFlag.PROFILE_RUNTIME) : EnumSet.noneOf(JobFlag.class));
hcc.waitForCompletion(jobId);
}
+
+    /**
+     * Command line entry point: parses the generic Hadoop options, resolves the
+     * input and output directories against the working directory and runs the
+     * {@link Plan#BUILD_DEBRUJIN_GRAPH} plan.
+     *
+     * @param args
+     *            generic Hadoop options followed by
+     *            {@code <serverIP> <port> <input> <output>}
+     * @throws Exception
+     *             if option parsing, connecting or running the job fails
+     */
+    public static void main(String[] args) throws Exception {
+        GenomixJobConf jobConf = new GenomixJobConf();
+        String[] positional = new GenericOptionsParser(jobConf, args).getRemainingArgs();
+        if (positional.length < 4) {
+            System.err.println("Need <serverIP> <port> <input> <output>");
+            System.exit(-1);
+        }
+
+        String serverAddress = positional[0];
+        int serverPort = Integer.parseInt(positional[1]);
+        int partitionsPerMachine = jobConf.getInt(CPARTITION_PER_MACHINE, 2);
+        boolean profilingEnabled = jobConf.getBoolean(IS_PROFILING, true);
+
+        // Relative input/output arguments are resolved against the job's working directory.
+        @SuppressWarnings("deprecation")
+        Path inputDir = new Path(jobConf.getWorkingDirectory(), positional[2]);
+        jobConf.set("mapred.input.dir", inputDir.toString());
+
+        @SuppressWarnings("deprecation")
+        Path outputDir = new Path(jobConf.getWorkingDirectory(), positional[3]);
+        jobConf.set("mapred.output.dir", outputDir.toString());
+
+        Driver driver = new Driver(serverAddress, serverPort, partitionsPerMachine);
+        driver.runJob(jobConf, Plan.BUILD_DEBRUJIN_GRAPH, profilingEnabled);
+    }
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/io/NodeTextWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/io/NodeTextWriterFactory.java
new file mode 100644
index 0000000..b7b7054
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/io/NodeTextWriterFactory.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.hyracks.newgraph.io;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.genomix.hyracks.newgraph.dataflow.ReadsKeyValueParserFactory;
+import edu.uci.ics.genomix.type.NodeWritable;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.hdfs.api.ITupleWriter;
+import edu.uci.ics.hyracks.hdfs.api.ITupleWriterFactory;
+
+/**
+ * Factory of tuple writers that emit each node as one line of text: the
+ * node's {@code toString()} form followed by a newline.
+ */
+public class NodeTextWriterFactory implements ITupleWriterFactory {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Charset used to encode the text form of a node. Named explicitly so the
+     * output does not depend on the default charset of the JVM that runs the writer.
+     */
+    private static final String OUTPUT_CHARSET = "UTF-8";
+
+    /** Index of the tuple field that {@code write} binds to the node's kmer. */
+    public static final int OutputKmerField = ReadsKeyValueParserFactory.OutputKmerField;
+    /** Index of the tuple field that {@code write} binds to the node itself. */
+    public static final int outputNodeField = ReadsKeyValueParserFactory.OutputNodeField;
+
+    private final int kmerSize;
+
+    /**
+     * @param k
+     *            kmer length; installed as the global kmer length when a writer is created
+     */
+    public NodeTextWriterFactory(int k) {
+        this.kmerSize = k;
+    }
+
+    /**
+     * Creates a writer that reuses a single {@link NodeWritable} for all tuples.
+     *
+     * @param ctx
+     *            task context (not used by this factory)
+     * @return a writer producing one text line per tuple
+     * @throws HyracksDataException
+     *             declared by the interface; not thrown here
+     */
+    @Override
+    public ITupleWriter getTupleWriter(IHyracksTaskContext ctx) throws HyracksDataException {
+        KmerBytesWritable.setGlobalKmerLength(kmerSize);
+        return new ITupleWriter() {
+            // Reused for every tuple; it only references the tuple's bytes.
+            private final NodeWritable node = new NodeWritable();
+
+            @Override
+            public void open(DataOutput output) throws HyracksDataException {
+                // nothing to prepare
+            }
+
+            @Override
+            public void write(DataOutput output, ITupleReference tuple) throws HyracksDataException {
+                node.setAsReference(tuple.getFieldData(outputNodeField), tuple.getFieldStart(outputNodeField));
+                node.getKmer().reset(kmerSize);
+                node.getKmer().setAsReference(tuple.getFieldData(OutputKmerField),
+                        tuple.getFieldStart(OutputKmerField));
+                try {
+                    // An unknown charset name surfaces as an IOException subclass and is wrapped below.
+                    output.write(node.toString().getBytes(OUTPUT_CHARSET));
+                    output.writeByte('\n');
+                } catch (IOException e) {
+                    throw new HyracksDataException(e);
+                }
+            }
+
+            @Override
+            public void close(DataOutput output) throws HyracksDataException {
+                // nothing to release
+            }
+
+        };
+    }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/GenomixJobConf.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/GenomixJobConf.java
new file mode 100644
index 0000000..b0edf77
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/GenomixJobConf.java
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.hyracks.newgraph.job;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+
+/**
+ * Job configuration of the genomix graph builder: a {@link JobConf} plus the
+ * configuration keys, their default values and a few typed setters.
+ */
+@SuppressWarnings("deprecation")
+public class GenomixJobConf extends JobConf {
+
+    public static final String JOB_NAME = "genomix";
+
+    // ---- configuration keys ----
+
+    /** Key of the kmer length. */
+    public static final String KMER_LENGTH = "genomix.kmerlen";
+    /** Key of the read length. */
+    public static final String READ_LENGTH = "genomix.readlen";
+    /** Key of the frame size. */
+    public static final String FRAME_SIZE = "genomix.framesize";
+    /** Key of the frame limit (needed by hyracks). */
+    public static final String FRAME_LIMIT = "genomix.framelimit";
+    /** Key of the table size (needed by hyracks). */
+    public static final String TABLE_SIZE = "genomix.tablesize";
+    /** Key of the group-by type. */
+    public static final String GROUPBY_TYPE = "genomix.graph.groupby.type";
+    /** Key of the graph output format. */
+    public static final String OUTPUT_FORMAT = "genomix.graph.output";
+    /** Key of the flag asking for the reversed kmer sequence. */
+    public static final String REVERSED_KMER = "genomix.kmer.reversed";
+
+    // Keys used by the hybrid group-by function in the graph build phase.
+    public static final String GROUPBY_HYBRID_INPUTSIZE = "genomix.graph.groupby.hybrid.inputsize";
+    public static final String GROUPBY_HYBRID_INPUTKEYS = "genomix.graph.groupby.hybrid.inputkeys";
+    public static final String GROUPBY_HYBRID_RECORDSIZE_SINGLE = "genomix.graph.groupby.hybrid.recordsize.single";
+    public static final String GROUPBY_HYBRID_RECORDSIZE_CROSS = "genomix.graph.groupby.hybrid.recordsize.cross";
+    public static final String GROUPBY_HYBRID_HASHLEVEL = "genomix.graph.groupby.hybrid.hashlevel";
+
+    // ---- default values ----
+
+    public static final int DEFAULT_KMERLEN = 21;
+    public static final int DEFAULT_READLEN = 124;
+    public static final int DEFAULT_FRAME_SIZE = 128 * 1024;
+    public static final int DEFAULT_FRAME_LIMIT = 4096;
+    public static final int DEFAULT_TABLE_SIZE = 10485767;
+    public static final long DEFAULT_GROUPBY_HYBRID_INPUTSIZE = 154000000L;
+    public static final long DEFAULT_GROUPBY_HYBRID_INPUTKEYS = 38500000L;
+    public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_SINGLE = 9;
+    public static final int DEFAULT_GROUPBY_HYBRID_HASHLEVEL = 1;
+    public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS = 13;
+
+    public static final boolean DEFAULT_REVERSED = true;
+
+    // ---- well-known option values ----
+
+    public static final String JOB_PLAN_GRAPHBUILD = "graphbuild";
+    public static final String JOB_PLAN_GRAPHSTAT = "graphstat";
+
+    public static final String GROUPBY_TYPE_HYBRID = "hybrid";
+    public static final String GROUPBY_TYPE_EXTERNAL = "external";
+    public static final String GROUPBY_TYPE_PRECLUSTER = "precluster";
+    public static final String OUTPUT_FORMAT_BINARY = "binary";
+    public static final String OUTPUT_FORMAT_TEXT = "text";
+
+    /**
+     * Creates a job configuration on top of a fresh {@link Configuration}.
+     */
+    public GenomixJobConf() throws IOException {
+        super(new Configuration());
+    }
+
+    /**
+     * Creates a job configuration on top of the given configuration.
+     *
+     * @param conf
+     *            configuration handed to the {@link JobConf} constructor
+     */
+    public GenomixJobConf(Configuration conf) throws IOException {
+        super(conf);
+    }
+
+    /**
+     * Stores the kmer length under {@link #KMER_LENGTH}.
+     *
+     * @param kmerlength
+     *            the desired kmer length
+     */
+    public final void setKmerLength(int kmerlength) {
+        setInt(KMER_LENGTH, kmerlength);
+    }
+
+    /**
+     * Stores the frame size under {@link #FRAME_SIZE}.
+     *
+     * @param frameSize
+     *            the desired frame size
+     */
+    public final void setFrameSize(int frameSize) {
+        setInt(FRAME_SIZE, frameSize);
+    }
+
+    /**
+     * Stores the frame limit under {@link #FRAME_LIMIT}.
+     *
+     * @param frameLimit
+     *            the desired frame limit
+     */
+    public final void setFrameLimit(int frameLimit) {
+        setInt(FRAME_LIMIT, frameLimit);
+    }
+
+    /**
+     * Stores the table size under {@link #TABLE_SIZE}.
+     *
+     * @param tableSize
+     *            the desired table size
+     */
+    public final void setTableSize(int tableSize) {
+        setInt(TABLE_SIZE, tableSize);
+    }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGen.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGen.java
new file mode 100644
index 0000000..9649566
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGen.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.hyracks.newgraph.job;
+
+import java.io.Serializable;
+import java.util.UUID;
+
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.exceptions.HyracksException;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.hdfs.dataflow.ConfFactory;
+
+/**
+ * Base class of the job generators. It keeps a configuration factory built
+ * from the job configuration and an id string derived from the clock at
+ * construction time; subclasses supply the actual job specification.
+ */
+public abstract class JobGen implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Configuration factory created from the job configuration given to the constructor. */
+    protected final ConfFactory confFactory;
+
+    /** Id string built from the wall-clock millis and the nano timer read at construction. */
+    protected String jobId;
+
+    /**
+     * @param job
+     *            job configuration wrapped into {@link #confFactory}
+     * @throws HyracksDataException
+     *             if the configuration factory cannot be created
+     */
+    public JobGen(GenomixJobConf job) throws HyracksDataException {
+        final long millis = System.currentTimeMillis();
+        final long nanos = System.nanoTime();
+        this.jobId = new UUID(millis, nanos).toString();
+        this.confFactory = new ConfFactory(job);
+    }
+
+    /**
+     * Builds the job specification of this generator.
+     *
+     * @return the generated job specification
+     * @throws HyracksException
+     *             if the job cannot be generated
+     */
+    public abstract JobSpecification generateJob() throws HyracksException;
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGenBrujinGraph.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGenBrujinGraph.java
index abfff00..afc1cf7 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGenBrujinGraph.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGenBrujinGraph.java
@@ -24,18 +24,39 @@
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
-import edu.uci.ics.genomix.hyracks.job.GenomixJobConf;
-import edu.uci.ics.genomix.hyracks.job.JobGen;
+import edu.uci.ics.genomix.hyracks.data.accessors.KmerHashPartitioncomputerFactory;
+import edu.uci.ics.genomix.hyracks.data.accessors.KmerNormarlizedComputerFactory;
+import edu.uci.ics.genomix.hyracks.data.primitive.KmerPointable;
+import edu.uci.ics.genomix.hyracks.newgraph.dataflow.ConnectorPolicyAssignmentPolicy;
import edu.uci.ics.genomix.hyracks.newgraph.dataflow.ReadsKeyValueParserFactory;
+import edu.uci.ics.genomix.hyracks.newgraph.dataflow.aggregators.AggregateKmerAggregateFactory;
+import edu.uci.ics.genomix.hyracks.newgraph.dataflow.aggregators.MergeKmerAggregateFactory;
+import edu.uci.ics.genomix.hyracks.newgraph.io.NodeTextWriterFactory;
+
import edu.uci.ics.hyracks.api.client.NodeControllerInfo;
import edu.uci.ics.hyracks.api.constraints.PartitionConstraintHelper;
import edu.uci.ics.hyracks.api.dataflow.IConnectorDescriptor;
import edu.uci.ics.hyracks.api.dataflow.IOperatorDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.INormalizedKeyComputerFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputerFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.api.exceptions.HyracksException;
import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.data.std.accessors.PointableBinaryComparatorFactory;
+import edu.uci.ics.hyracks.data.std.api.IPointableFactory;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.connectors.MToNPartitioningMergingConnectorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+import edu.uci.ics.hyracks.dataflow.std.group.preclustered.PreclusteredGroupOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.sort.ExternalSortOperatorDescriptor;
+import edu.uci.ics.hyracks.hdfs.api.ITupleWriterFactory;
import edu.uci.ics.hyracks.hdfs.dataflow.ConfFactory;
import edu.uci.ics.hyracks.hdfs.dataflow.HDFSReadOperatorDescriptor;
+import edu.uci.ics.hyracks.hdfs.dataflow.HDFSWriteOperatorDescriptor;
import edu.uci.ics.hyracks.hdfs.scheduler.Scheduler;
@SuppressWarnings("deprecation")
@@ -68,7 +89,7 @@
protected int tableSize;
protected GroupbyType groupbyType;
protected OutputFormat outputFormat;
- protected boolean bGenerateReversedKmer;
+
protected void logDebug(String status) {
LOG.debug(status + " nc nodes:" + ncNodeNames.length);
@@ -85,7 +106,116 @@
}
initJobConfiguration(scheduler);
}
-
+
+    /**
+     * Creates the pieces of a two-stage group-by for the configured group-by type.
+     * Every type currently maps to the pre-clustered implementation.
+     *
+     * @return a three element array: {@code [0]} the group operator using
+     *         {@code aggregator} and {@code combineRed}, {@code [1]} the
+     *         partitioning-merging connector, {@code [2]} the group operator
+     *         using {@code merger} and {@code finalRec}
+     */
+    private Object[] generateAggeragateDescriptorbyType(JobSpecification jobSpec, int[] keyFields,
+            IAggregatorDescriptorFactory aggregator, IAggregatorDescriptorFactory merger,
+            ITuplePartitionComputerFactory partition, INormalizedKeyComputerFactory normalizer,
+            IPointableFactory pointable, RecordDescriptor combineRed, RecordDescriptor finalRec)
+            throws HyracksDataException {
+
+        Object[] descriptors = new Object[3];
+
+        switch (groupbyType) {
+            case PRECLUSTER:
+            default: {
+                // Each descriptor gets its own comparator array, created right before it is used.
+                IBinaryComparatorFactory[] localComparators = { PointableBinaryComparatorFactory.of(pointable) };
+                descriptors[0] = new PreclusteredGroupOperatorDescriptor(jobSpec, keyFields, localComparators,
+                        aggregator, combineRed);
+
+                IBinaryComparatorFactory[] connectorComparators = { PointableBinaryComparatorFactory.of(pointable) };
+                descriptors[1] = new MToNPartitioningMergingConnectorDescriptor(jobSpec, partition, keyFields,
+                        connectorComparators);
+
+                IBinaryComparatorFactory[] mergeComparators = { PointableBinaryComparatorFactory.of(pointable) };
+                descriptors[2] = new PreclusteredGroupOperatorDescriptor(jobSpec, keyFields, mergeComparators,
+                        merger, finalRec);
+
+                jobSpec.setConnectorPolicyAssignmentPolicy(new ConnectorPolicyAssignmentPolicy());
+            }
+        }
+        return descriptors;
+    }
+
+    /**
+     * Creates the HDFS read operator that parses the input splits with a
+     * {@link ReadsKeyValueParserFactory}.
+     *
+     * @param jobSpec
+     *            job specification the operator is created for
+     * @return the read operator
+     * @throws HyracksDataException
+     *             wrapping any exception raised while computing the splits or
+     *             building the operator
+     */
+    public HDFSReadOperatorDescriptor createHDFSReader(JobSpecification jobSpec) throws HyracksDataException {
+        try {
+            // One split hint per node controller name.
+            JobConf splitConf = hadoopJobConfFactory.getConf();
+            InputSplit[] inputSplits = splitConf.getInputFormat().getSplits(hadoopJobConfFactory.getConf(),
+                    ncNodeNames.length);
+
+            JobConf readerConf = hadoopJobConfFactory.getConf();
+            ReadsKeyValueParserFactory parserFactory = new ReadsKeyValueParserFactory(readLength, kmerSize);
+            return new HDFSReadOperatorDescriptor(jobSpec, ReadsKeyValueParserFactory.readKmerOutputRec,
+                    readerConf, inputSplits, readSchedule, parserFactory);
+        } catch (Exception e) {
+            throw new HyracksDataException(e);
+        }
+    }
+
+ /**
+ * Pins two operators to their node lists and links them with a connector.
+ *
+ * @param jobSpec
+ * job specification that receives the constraints and the connection
+ * @param preOp
+ * upstream operator, constrained to {@code preNodes}
+ * @param preNodes
+ * locations passed as the absolute location constraint of {@code preOp}
+ * @param nextOp
+ * downstream operator, constrained to {@code nextNodes}
+ * @param nextNodes
+ * locations passed as the absolute location constraint of {@code nextOp}
+ * @param conn
+ * connector placed between the two operators (index 0 on both sides)
+ */
+ public static void connectOperators(JobSpecification jobSpec, IOperatorDescriptor preOp, String[] preNodes,
+ IOperatorDescriptor nextOp, String[] nextNodes, IConnectorDescriptor conn) {
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, preOp, preNodes);
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, nextOp, nextNodes);
+ jobSpec.connect(conn, preOp, 0, nextOp, 0);
+ }
+
+ /**
+ * Appends the "group by kmer" stage after the read operator: an external
+ * sort on field 0, a local group operator, and a second group operator fed
+ * through the connector returned by generateAggeragateDescriptorbyType.
+ *
+ * @param jobSpec
+ * job specification the operators are added to
+ * @param readOperator
+ * operator whose output is sorted and grouped
+ * @return the last (cross) group operator of the stage
+ * @throws HyracksDataException
+ * if one of the descriptors cannot be created
+ */
+ public AbstractOperatorDescriptor generateGroupbyKmerJob(JobSpecification jobSpec,
+ AbstractOperatorDescriptor readOperator) throws HyracksDataException {
+ int[] keyFields = new int[] { 0 }; // the id of grouped key
+
+ // Sort the reader output on the key field using the kmer comparator.
+ ExternalSortOperatorDescriptor sorter = new ExternalSortOperatorDescriptor(jobSpec, frameLimits, keyFields,
+ new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory.of(KmerPointable.FACTORY) },
+ ReadsKeyValueParserFactory.readKmerOutputRec);
+
+ connectOperators(jobSpec, readOperator, ncNodeNames, sorter, ncNodeNames, new OneToOneConnectorDescriptor(
+ jobSpec));
+
+ // Two-field record descriptor without serializers; used for both group stages.
+ RecordDescriptor combineKmerOutputRec = new RecordDescriptor(new ISerializerDeserializer[] { null, null });
+ jobSpec.setFrameSize(frameSize);
+
+ // objs[0]: local group operator, objs[1]: connector, objs[2]: cross group operator.
+ Object[] objs = generateAggeragateDescriptorbyType(jobSpec, keyFields, new AggregateKmerAggregateFactory(kmerSize),
+ new MergeKmerAggregateFactory(kmerSize), new KmerHashPartitioncomputerFactory(),
+ new KmerNormarlizedComputerFactory(), KmerPointable.FACTORY, combineKmerOutputRec, combineKmerOutputRec);
+ AbstractOperatorDescriptor kmerLocalAggregator = (AbstractOperatorDescriptor) objs[0];
+ logDebug("LocalKmerGroupby Operator");
+ connectOperators(jobSpec, sorter, ncNodeNames, kmerLocalAggregator, ncNodeNames,
+ new OneToOneConnectorDescriptor(jobSpec));
+
+ logDebug("CrossKmerGroupby Operator");
+ IConnectorDescriptor kmerConnPartition = (IConnectorDescriptor) objs[1];
+ AbstractOperatorDescriptor kmerCrossAggregator = (AbstractOperatorDescriptor) objs[2];
+ connectOperators(jobSpec, kmerLocalAggregator, ncNodeNames, kmerCrossAggregator, ncNodeNames, kmerConnPartition);
+ return kmerCrossAggregator;
+ }
+
+    /**
+     * Appends the HDFS write operator that stores the nodes produced by the
+     * given operator.
+     *
+     * @param jobSpec
+     *            job specification the writer is added to
+     * @param mapEachReadToNode
+     *            operator whose output is written
+     * @return the write operator
+     * @throws HyracksException
+     *             if the configured output format has no node writer (only
+     *             {@code TEXT} is supported), or the writer cannot be created
+     */
+    public AbstractOperatorDescriptor generateNodeWriterOpertator(JobSpecification jobSpec,
+            AbstractOperatorDescriptor mapEachReadToNode) throws HyracksException {
+        ITupleWriterFactory nodeWriter;
+        switch (outputFormat) {
+            case TEXT:
+                nodeWriter = new NodeTextWriterFactory(kmerSize);
+                break;
+            default:
+                // Fail while building the job instead of handing a null writer
+                // factory to the write operator.
+                throw new HyracksException("Unsupported output format for the node writer: " + outputFormat);
+        }
+        logDebug("WriteOperator");
+        // Output Node
+        HDFSWriteOperatorDescriptor writeNodeOperator = new HDFSWriteOperatorDescriptor(jobSpec,
+                hadoopJobConfFactory.getConf(), nodeWriter);
+        connectOperators(jobSpec, mapEachReadToNode, ncNodeNames, writeNodeOperator, ncNodeNames,
+                new OneToOneConnectorDescriptor(jobSpec));
+        return writeNodeOperator;
+    }
+
+    /**
+     * Builds the graph construction job: read the input, group by kmer, write
+     * the resulting nodes.
+     *
+     * @return the job specification, rooted at the final write operator
+     * @throws HyracksException
+     *             if any stage of the job cannot be generated
+     */
+    @Override
+    public JobSpecification generateJob() throws HyracksException {
+
+        JobSpecification jobSpec = new JobSpecification();
+        logDebug("ReadKmer Operator");
+
+        HDFSReadOperatorDescriptor readOperator = createHDFSReader(jobSpec);
+
+        logDebug("Group by Kmer");
+        AbstractOperatorDescriptor lastOperator = generateGroupbyKmerJob(jobSpec, readOperator);
+
+        logDebug("Write node to result");
+        lastOperator = generateNodeWriterOpertator(jobSpec, lastOperator);
+
+        // The root of the job is its last operator (the writer), not the reader it starts from.
+        jobSpec.addRoot(lastOperator);
+        return jobSpec;
+    }
+
protected void initJobConfiguration(Scheduler scheduler) throws HyracksDataException {
Configuration conf = confFactory.getConf();
readLength = conf.getInt(GenomixJobConf.READ_LENGTH, GenomixJobConf.DEFAULT_READLEN);
@@ -98,18 +228,11 @@
tableSize = conf.getInt(GenomixJobConf.TABLE_SIZE, GenomixJobConf.DEFAULT_TABLE_SIZE);
frameSize = conf.getInt(GenomixJobConf.FRAME_SIZE, GenomixJobConf.DEFAULT_FRAME_SIZE);
- bGenerateReversedKmer = conf.getBoolean(GenomixJobConf.REVERSED_KMER, GenomixJobConf.DEFAULT_REVERSED);
-
String type = conf.get(GenomixJobConf.GROUPBY_TYPE, GenomixJobConf.GROUPBY_TYPE_PRECLUSTER);
- if (type.equalsIgnoreCase(GenomixJobConf.GROUPBY_TYPE_EXTERNAL)) {
- groupbyType = GroupbyType.EXTERNAL;
- } else if (type.equalsIgnoreCase(GenomixJobConf.GROUPBY_TYPE_PRECLUSTER)) {
- groupbyType = GroupbyType.PRECLUSTER;
- } else {
- groupbyType = GroupbyType.HYBRIDHASH;
- }
+ groupbyType = GroupbyType.PRECLUSTER;
- String output = conf.get(GenomixJobConf.OUTPUT_FORMAT, GenomixJobConf.OUTPUT_FORMAT_BINARY);
+ String output = conf.get(GenomixJobConf.OUTPUT_FORMAT, GenomixJobConf.OUTPUT_FORMAT_TEXT);
+
if (output.equalsIgnoreCase("text")) {
outputFormat = OutputFormat.TEXT;
} else {
@@ -131,30 +254,5 @@
LOG.info("Frame limit" + frameLimits);
LOG.info("Frame kmerByteSize" + frameSize);
}
-
- public HDFSReadOperatorDescriptor createHDFSReader(JobSpecification jobSpec) throws HyracksDataException {
- try {
- InputSplit[] splits = hadoopJobConfFactory.getConf().getInputFormat()
- .getSplits(hadoopJobConfFactory.getConf(), ncNodeNames.length);
- return new HDFSReadOperatorDescriptor(jobSpec, ReadsKeyValueParserFactory.readKmerOutputRec,
- hadoopJobConfFactory.getConf(), splits, readSchedule, new ReadsKeyValueParserFactory(readLength,
- kmerSize, bGenerateReversedKmer));
- } catch (Exception e) {
- throw new HyracksDataException(e);
- }
- }
-
- public static void connectOperators(JobSpecification jobSpec, IOperatorDescriptor preOp, String[] preNodes,
- IOperatorDescriptor nextOp, String[] nextNodes, IConnectorDescriptor conn) {
- PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, preOp, preNodes);
- PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, nextOp, nextNodes);
- jobSpec.connect(conn, preOp, 0, nextOp, 0);
- }
-
- @Override
- public JobSpecification generateJob() throws HyracksException {
- // TODO Auto-generated method stub
- return null;
- }
}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGenCheckReader.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGenCheckReader.java
index 6026ac1..f512f43 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGenCheckReader.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/JobGenCheckReader.java
@@ -18,12 +18,11 @@
import java.io.IOException;
import java.util.Map;
+import org.apache.hadoop.conf.Configuration;
import edu.uci.ics.genomix.hyracks.newgraph.dataflow.ReadsKeyValueParserFactory;
-import edu.uci.ics.genomix.hyracks.job.GenomixJobConf;
-import edu.uci.ics.genomix.oldtype.IntermediateNodeWritable;
-import edu.uci.ics.genomix.oldtype.PositionWritable;
+import edu.uci.ics.genomix.type.NodeWritable;
import edu.uci.ics.genomix.type.KmerBytesWritable;
-import edu.uci.ics.genomix.type.KmerListWritable;
+
import edu.uci.ics.hyracks.api.client.NodeControllerInfo;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -34,12 +33,16 @@
import edu.uci.ics.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor;
import edu.uci.ics.hyracks.hdfs.api.ITupleWriter;
import edu.uci.ics.hyracks.hdfs.api.ITupleWriterFactory;
+import edu.uci.ics.hyracks.hdfs.dataflow.ConfFactory;
import edu.uci.ics.hyracks.hdfs.dataflow.HDFSReadOperatorDescriptor;
import edu.uci.ics.hyracks.hdfs.dataflow.HDFSWriteOperatorDescriptor;
import edu.uci.ics.hyracks.hdfs.scheduler.Scheduler;
public class JobGenCheckReader extends JobGenBrujinGraph {
+ /**
+ *
+ */
private static final long serialVersionUID = 1L;
public JobGenCheckReader(GenomixJobConf job, Scheduler scheduler, Map<String, NodeControllerInfo> ncMap,
@@ -62,7 +65,7 @@
public AbstractSingleActivityOperatorDescriptor generateRootByWriteKmerReader(JobSpecification jobSpec,
HDFSReadOperatorDescriptor readOperator) throws HyracksException {
- // Output Kmer
+
HDFSWriteOperatorDescriptor writeKmerOperator = new HDFSWriteOperatorDescriptor(jobSpec,
hadoopJobConfFactory.getConf(), new ITupleWriterFactory() {
@@ -70,11 +73,11 @@
@Override
public ITupleWriter getTupleWriter(IHyracksTaskContext ctx) throws HyracksDataException {
+ KmerBytesWritable.setGlobalKmerLength(kmerSize);
return new ITupleWriter() {
- private KmerBytesWritable kmer = new KmerBytesWritable(kmerSize);
- private KmerListWritable kmerList = new KmerListWritable();
- //private IntermediateNodeWritable intermediateNode = new IntermediateNodeWritable();
+ private NodeWritable outputNode = new NodeWritable();
+ private KmerBytesWritable outputKmer = new KmerBytesWritable();
@Override
public void open(DataOutput output) throws HyracksDataException {
@@ -83,36 +86,20 @@
@Override
public void write(DataOutput output, ITupleReference tuple) throws HyracksDataException {
try {
- if (kmer.getLength() > tuple
+ if (outputKmer.getLength() > tuple
.getFieldLength(ReadsKeyValueParserFactory.OutputKmerField)) {
throw new IllegalArgumentException("Not enough kmer bytes");
}
- //kemr
- kmer.setNewReference(
+ outputKmer.setAsReference(
tuple.getFieldData(ReadsKeyValueParserFactory.OutputKmerField),
tuple.getFieldStart(ReadsKeyValueParserFactory.OutputKmerField));
- kmerList.setNewReference(tuple.getFieldLength(ReadsKeyValueParserFactory.OutputNodeIdField),
- tuple.getFieldData(ReadsKeyValueParserFactory.OutputNodeIdField),
- tuple.getFieldStart(ReadsKeyValueParserFactory.OutputNodeIdField));
-// //nodeId
-// intermediateNode.getNodeId().setNewReference(tuple.getFieldData(ReadsKeyValueParserFactory.OutputNodeIdField),
-// tuple.getFieldStart(ReadsKeyValueParserFactory.OutputNodeIdField));
- //FF list
-// intermediateNode.getFFList().setNewReference(tuple.getFieldLength(ReadsKeyValueParserFactory.OutputForwardForwardField) / 2 ,
-// tuple.getFieldData(ReadsKeyValueParserFactory.OutputForwardForwardField), tuple.getFieldStart(ReadsKeyValueParserFactory.OutputForwardForwardField));
-// //FR list
-// intermediateNode.getFRList().setNewReference(tuple.getFieldLength(ReadsKeyValueParserFactory.OutputForwardReverseField / kmer.getLength()),
-// tuple.getFieldData(ReadsKeyValueParserFactory.OutputForwardReverseField), tuple.getFieldStart(ReadsKeyValueParserFactory.OutputForwardReverseField));
-// //RF list
-// intermediateNode.getRFList().setNewReference(tuple.getFieldLength(ReadsKeyValueParserFactory.OutputReverseForwardField / kmer.getLength()),
-// tuple.getFieldData(ReadsKeyValueParserFactory.OutputReverseForwardField), tuple.getFieldStart(ReadsKeyValueParserFactory.OutputReverseForwardField));
-// //RR list
-// intermediateNode.getRRList().setNewReference(tuple.getFieldLength(ReadsKeyValueParserFactory.OutputReverseReverseField / kmer.getLength()),
-// tuple.getFieldData(ReadsKeyValueParserFactory.OutputReverseReverseField), tuple.getFieldStart(ReadsKeyValueParserFactory.OutputReverseReverseField));
-//
- output.write(kmer.toString().getBytes());
+ outputNode.setAsReference(
+ tuple.getFieldData(ReadsKeyValueParserFactory.OutputNodeField),
+ tuple.getFieldStart(ReadsKeyValueParserFactory.OutputNodeField));
+
+ output.write(outputKmer.toString().getBytes());
output.writeByte('\t');
- output.write(kmerList.toString().getBytes());
+ output.write(outputNode.toString().getBytes());
output.writeByte('\n');
} catch (IOException e) {
throw new HyracksDataException(e);
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/newgraph/test/JobRun.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/newgraph/test/JobRun.java
index dfae011..25915aa 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/newgraph/test/JobRun.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/newgraph/test/JobRun.java
@@ -8,7 +8,6 @@
import java.io.IOException;
import junit.framework.Assert;
-
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -24,16 +23,16 @@
import org.junit.Before;
import org.junit.Test;
-import edu.uci.ics.genomix.hyracks.job.GenomixJobConf;
+import edu.uci.ics.genomix.hyracks.newgraph.job.GenomixJobConf;
import edu.uci.ics.genomix.hyracks.newgraph.driver.Driver;
import edu.uci.ics.genomix.hyracks.newgraph.driver.Driver.Plan;
import edu.uci.ics.genomix.hyracks.test.TestUtils;
-import edu.uci.ics.genomix.oldtype.NodeWritable;
+//import edu.uci.ics.genomix.oldtype.NodeWritable;
@SuppressWarnings("deprecation")
public class JobRun {
private static final int KmerSize = 5;
- private static final int ReadLength = 8;
+ private static final int ReadLength = 6;
private static final String ACTUAL_RESULT_DIR = "actual";
private static final String PATH_TO_HADOOP_CONF = "src/test/resources/hadoop/conf";
@@ -41,16 +40,7 @@
private static final String HDFS_INPUT_PATH = "/webmap";
private static final String HDFS_OUTPUT_PATH = "/webmap_result";
- private static final String EXPECTED_DIR = "src/test/resources/expected/";
- private static final String EXPECTED_READER_RESULT = EXPECTED_DIR + "result_after_initial_read";
-// private static final String EXPECTED_OUPUT_KMER = EXPECTED_DIR + "result_after_kmerAggregate";
-// private static final String EXPECTED_KMER_TO_READID = EXPECTED_DIR + "result_after_kmer2readId";
-// private static final String EXPECTED_GROUPBYREADID = EXPECTED_DIR + "result_after_readIDAggreage";
-// private static final String EXPECTED_OUPUT_NODE = EXPECTED_DIR + "result_after_generateNode";
-// private static final String EXPECTED_UNMERGED = EXPECTED_DIR + "result_unmerged";
-
private static final String DUMPED_RESULT = ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH + "/merged.txt";
- private static final String CONVERT_RESULT = DUMPED_RESULT + ".txt";
private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";;
private MiniDFSCluster dfsCluster;
@@ -63,13 +53,22 @@
@Test
public void TestAll() throws Exception {
TestReader();
+// TestGroupby();
}
public void TestReader() throws Exception {
cleanUpReEntry();
conf.set(GenomixJobConf.OUTPUT_FORMAT, GenomixJobConf.OUTPUT_FORMAT_TEXT);
driver.runJob(new GenomixJobConf(conf), Plan.CHECK_KMERREADER, true);
- Assert.assertEquals(true, checkResults(EXPECTED_READER_RESULT, null));
+ dumpResult();
+ }
+
+ /**
+ * Runs the BUILD_DEBRUJIN_GRAPH plan with text output and the precluster
+ * group-by type, with profiling enabled. The result is neither dumped nor
+ * compared: the dumpResult() call below is commented out.
+ *
+ * @throws Exception
+ * if the clean-up or the job run fails
+ */
+ public void TestGroupby() throws Exception {
+ conf.set(GenomixJobConf.OUTPUT_FORMAT, GenomixJobConf.OUTPUT_FORMAT_TEXT);
+ cleanUpReEntry();
+ conf.set(GenomixJobConf.GROUPBY_TYPE, GenomixJobConf.GROUPBY_TYPE_PRECLUSTER);
+ driver.runJob(new GenomixJobConf(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+// dumpResult();
+ }
@Before
@@ -129,54 +128,12 @@
}
}
- private boolean checkResults(String expectedPath, int[] poslistField) throws Exception {
- File dumped = null;
+ private void dumpResult() throws Exception {
String format = conf.get(GenomixJobConf.OUTPUT_FORMAT);
if (GenomixJobConf.OUTPUT_FORMAT_TEXT.equalsIgnoreCase(format)) {
FileUtil.copyMerge(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH),
FileSystem.getLocal(new Configuration()), new Path(DUMPED_RESULT), false, conf, null);
- dumped = new File(DUMPED_RESULT);
- } else {
-
- FileSystem.getLocal(new Configuration()).mkdirs(new Path(ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH));
- File filePathTo = new File(CONVERT_RESULT);
- BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
- for (int i = 0; i < numPartitionPerMachine * numberOfNC; i++) {
- String partname = "/part-" + i;
- // FileUtil.copy(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH
- // + partname), FileSystem.getLocal(new Configuration()),
- // new Path(ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH + partname),
- // false, conf);
-
- Path path = new Path(HDFS_OUTPUT_PATH + partname);
- FileSystem dfs = FileSystem.get(conf);
- if (dfs.getFileStatus(path).getLen() == 0) {
- continue;
- }
- SequenceFile.Reader reader = new SequenceFile.Reader(dfs, path, conf);
-
- NodeWritable node = new NodeWritable(conf.getInt(GenomixJobConf.KMER_LENGTH, KmerSize));
- NullWritable value = NullWritable.get();
- while (reader.next(node, value)) {
- if (node == null) {
- break;
- }
- bw.write(node.toString());
- System.out.println(node.toString());
- bw.newLine();
- }
- reader.close();
- }
- bw.close();
- dumped = new File(CONVERT_RESULT);
- }
-
- if (poslistField != null) {
- TestUtils.compareWithUnSortedPosition(new File(expectedPath), dumped, poslistField);
- } else {
- TestUtils.compareWithSortedResult(new File(expectedPath), dumped);
- }
- return true;
+ }
}
@After
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/test/JobRunStepByStepTest.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/test/JobRunStepByStepTest.java
index bd761a5..51a0d15 100644
--- a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/test/JobRunStepByStepTest.java
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/hyracks/test/JobRunStepByStepTest.java
@@ -41,7 +41,7 @@
import edu.uci.ics.genomix.hyracks.driver.Driver;
import edu.uci.ics.genomix.hyracks.driver.Driver.Plan;
import edu.uci.ics.genomix.hyracks.job.GenomixJobConf;
-import edu.uci.ics.genomix.oldtype.NodeWritable;
+import edu.uci.ics.genomix.velvet.oldtype.NodeWritable;
@SuppressWarnings("deprecation")
public class JobRunStepByStepTest {
@@ -75,11 +75,11 @@
@Test
public void TestAll() throws Exception {
- TestReader();
+// TestReader();
// TestGroupbyKmer();
// TestMapKmerToRead();
// TestGroupByReadID();
-// TestEndToEnd();
+ TestEndToEnd();
// TestUnMergedNode();
}
diff --git a/genomix/genomix-hyracks/src/test/resources/data/webmap/test1.txt b/genomix/genomix-hyracks/src/test/resources/data/webmap/test1.txt
index 17770fa..3f1cd5c 100644
--- a/genomix/genomix-hyracks/src/test/resources/data/webmap/test1.txt
+++ b/genomix/genomix-hyracks/src/test/resources/data/webmap/test1.txt
@@ -1 +1 @@
-1 AATAGAAG
+1 AATAGA
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/GraphCleanInputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/GraphCleanInputFormat.java
index e36e344..e8a72ce 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/GraphCleanInputFormat.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/GraphCleanInputFormat.java
@@ -60,7 +60,7 @@
/**
* set the src vertex id
*/
- vertexId.set(getRecordReader().getCurrentKey());
+ vertexId.setAsCopy(getRecordReader().getCurrentKey());
vertex.setVertexId(vertexId);
/**
* set the vertex value
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/InitialGraphCleanInputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/InitialGraphCleanInputFormat.java
index 0d685de..4dfff11 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/InitialGraphCleanInputFormat.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/InitialGraphCleanInputFormat.java
@@ -11,6 +11,7 @@
import edu.uci.ics.pregelix.api.io.VertexReader;
import edu.uci.ics.pregelix.api.util.BspUtils;
import edu.uci.ics.genomix.type.NodeWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
import edu.uci.ics.genomix.pregelix.io.MessageWritable;
import edu.uci.ics.genomix.pregelix.io.VertexValueWritable;
import edu.uci.ics.genomix.pregelix.io.VertexValueWritable.State;
@@ -64,19 +65,20 @@
/**
* set the src vertex id
*/
- vertexId.set(getRecordReader().getCurrentKey());
+ vertexId.setAsCopy(getRecordReader().getCurrentKey());
vertex.setVertexId(vertexId);
/**
* set the vertex value
*/
node.set(getRecordReader().getCurrentValue());
- vertexValue.setKmerlength(node.getKmerlength());
+ vertexValue.setKmerlength(node.getKmerLength());
vertexValue.setNodeIdList(node.getNodeIdList());
vertexValue.setFFList(node.getFFList());
vertexValue.setFRList(node.getFRList());
vertexValue.setRFList(node.getRFList());
vertexValue.setRRList(node.getRRList());
- vertexValue.setKmer(getRecordReader().getCurrentKey());
+ // TODO make this more efficient (don't use toString)
+ vertexValue.setKmer(new VKmerBytesWritable(getRecordReader().getCurrentKey().toString()));
vertexValue.setState(State.IS_NON);
vertex.setVertexValue(vertexValue);
}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/AdjacencyListWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/AdjacencyListWritable.java
index c35ad7f..b19a0cf 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/AdjacencyListWritable.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/AdjacencyListWritable.java
@@ -19,13 +19,13 @@
}
public AdjacencyListWritable(int kmerSize){
- forwardList = new KmerListWritable(kmerSize);
- reverseList = new KmerListWritable(kmerSize);
+ forwardList = new KmerListWritable();
+ reverseList = new KmerListWritable();
}
public void set(AdjacencyListWritable adjacencyList){
- forwardList.set(adjacencyList.getForwardList());
- reverseList.set(adjacencyList.getReverseList());
+ forwardList.setCopy(adjacencyList.getForwardList());
+ reverseList.setCopy(adjacencyList.getReverseList());
}
public void reset(){
@@ -34,8 +34,8 @@
}
public void reset(int kmerSize){
- forwardList.reset(kmerSize);
- reverseList.reset(kmerSize);
+ forwardList.reset();
+ reverseList.reset();
}
public int getCountOfPosition(){
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MergeBubbleMessageWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MergeBubbleMessageWritable.java
index c3b4710..9fd15dd 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MergeBubbleMessageWritable.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MergeBubbleMessageWritable.java
@@ -27,7 +27,7 @@
public MergeBubbleMessageWritable() {
sourceVertexId = new PositionWritable();
- chainVertexId = new KmerBytesWritable(0);
+ chainVertexId = new KmerBytesWritable();
neighberNode = new AdjacencyListWritable();
startVertexId = new PositionWritable();
message = Message.NON;
@@ -42,7 +42,7 @@
}
if (chainVertexId != null) {
checkMessage |= CheckMessage.CHAIN;
- this.chainVertexId.set(msg.getChainVertexId());
+ this.chainVertexId.setAsCopy(msg.getChainVertexId());
}
if (neighberNode != null) {
checkMessage |= CheckMessage.NEIGHBER;
@@ -63,7 +63,7 @@
}
if (chainVertexId != null) {
checkMessage |= CheckMessage.CHAIN;
- this.chainVertexId.set(chainVertexId);
+ this.chainVertexId.setAsCopy(chainVertexId);
}
if (neighberNode != null) {
checkMessage |= CheckMessage.NEIGHBER;
@@ -78,7 +78,7 @@
public void reset() {
checkMessage = 0;
- chainVertexId.reset(1);
+// chainVertexId.reset();
neighberNode.reset();
message = Message.NON;
}
@@ -101,7 +101,7 @@
public void setChainVertexId(KmerBytesWritable chainVertexId) {
if (chainVertexId != null) {
checkMessage |= CheckMessage.CHAIN;
- this.chainVertexId.set(chainVertexId);
+ this.chainVertexId.setAsCopy(chainVertexId);
}
}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MessageWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MessageWritable.java
index 66dd474..e3cd345 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MessageWritable.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/MessageWritable.java
@@ -10,6 +10,7 @@
import edu.uci.ics.genomix.pregelix.type.Message;
import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.PositionListWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
public class MessageWritable implements WritableComparable<MessageWritable> {
/**
@@ -18,7 +19,7 @@
* file stores the point to the file that stores the chains of connected DNA
*/
private KmerBytesWritable sourceVertexId;
- private KmerBytesWritable kmer;
+ private VKmerBytesWritable kmer;
private AdjacencyListWritable neighberNode; //incoming or outgoing
private PositionListWritable nodeIdList = new PositionListWritable();
private byte flag;
@@ -30,7 +31,7 @@
public MessageWritable() {
sourceVertexId = new KmerBytesWritable();
- kmer = new KmerBytesWritable(0);
+ kmer = new VKmerBytesWritable();
neighberNode = new AdjacencyListWritable();
flag = Message.NON;
isFlip = false;
@@ -40,7 +41,7 @@
public MessageWritable(int kmerSize) {
kmerlength = kmerSize;
sourceVertexId = new KmerBytesWritable();
- kmer = new KmerBytesWritable(0);
+ kmer = new VKmerBytesWritable();
neighberNode = new AdjacencyListWritable(kmerSize);
flag = Message.NON;
isFlip = false;
@@ -52,11 +53,11 @@
checkMessage = 0;
if (sourceVertexId != null) {
checkMessage |= CheckMessage.SOURCE;
- this.sourceVertexId.set(msg.getSourceVertexId());
+ this.sourceVertexId.setAsCopy(msg.getSourceVertexId());
}
if (kmer != null) {
checkMessage |= CheckMessage.CHAIN;
- this.kmer.set(msg.getActualKmer());
+ this.kmer.setAsCopy(msg.getActualKmer());
}
if (neighberNode != null) {
checkMessage |= CheckMessage.NEIGHBER;
@@ -72,11 +73,11 @@
checkMessage = 0;
if (sourceVertexId != null) {
checkMessage |= CheckMessage.SOURCE;
- this.sourceVertexId.set(sourceVertexId);
+ this.sourceVertexId.setAsCopy(sourceVertexId);
}
if (chainVertexId != null) {
checkMessage |= CheckMessage.CHAIN;
- this.kmer.set(chainVertexId);
+ this.kmer.setAsCopy(new VKmerBytesWritable(chainVertexId.toString())); // TODO Vkmer
}
if (neighberNode != null) {
checkMessage |= CheckMessage.NEIGHBER;
@@ -92,7 +93,7 @@
public void reset(int kmerSize) {
checkMessage = (byte) 0;
kmerlength = kmerSize;
- kmer.reset(1);
+// kmer.reset();
neighberNode.reset(kmerSize);
flag = Message.NON;
isFlip = false;
@@ -105,29 +106,29 @@
public void setSourceVertexId(KmerBytesWritable sourceVertexId) {
if (sourceVertexId != null) {
checkMessage |= CheckMessage.SOURCE;
- this.sourceVertexId.set(sourceVertexId);
+ this.sourceVertexId.setAsCopy(sourceVertexId);
}
}
- public KmerBytesWritable getActualKmer() {
+ public VKmerBytesWritable getActualKmer() {
return kmer;
}
- public void setAcutalKmer(KmerBytesWritable actualKmer) {
+ public void setActualKmer(VKmerBytesWritable actualKmer) {
if (actualKmer != null) {
checkMessage |= CheckMessage.CHAIN;
- this.kmer.set(actualKmer);
+ this.kmer.setAsCopy(new VKmerBytesWritable(actualKmer.toString()));
}
}
- public KmerBytesWritable getCreatedVertexId() {
+ public VKmerBytesWritable getCreatedVertexId() {
return kmer;
}
public void setCreatedVertexId(KmerBytesWritable actualKmer) {
if (actualKmer != null) {
checkMessage |= CheckMessage.CHAIN;
- this.kmer.set(actualKmer);
+ this.kmer.setAsCopy(new VKmerBytesWritable(actualKmer.toString()));
}
}
@@ -143,7 +144,7 @@
}
public int getLengthOfChain() {
- return kmer.getKmerLength();
+ return kmer.getKmerLetterLength();
}
public byte getFlag() {
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java
index c6ff206..9cdac8f 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java
@@ -7,6 +7,7 @@
import edu.uci.ics.genomix.type.PositionListWritable;
import edu.uci.ics.genomix.pregelix.type.MessageFlag;
import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
import edu.uci.ics.genomix.type.KmerListWritable;
public class VertexValueWritable implements WritableComparable<VertexValueWritable> {
@@ -48,7 +49,7 @@
private AdjacencyListWritable incomingList;
private AdjacencyListWritable outgoingList;
private byte state;
- private KmerBytesWritable kmer;
+ private VKmerBytesWritable kmer;
private int kmerlength = 0;
private boolean isFakeVertex = false;
@@ -62,12 +63,12 @@
incomingList = new AdjacencyListWritable();
outgoingList = new AdjacencyListWritable();
state = State.IS_NON;
- kmer = new KmerBytesWritable(kmerSize);
+ kmer = new VKmerBytesWritable();
}
public VertexValueWritable(PositionListWritable nodeIdList, KmerListWritable forwardForwardList, KmerListWritable forwardReverseList,
KmerListWritable reverseForwardList, KmerListWritable reverseReverseList,
- byte state, KmerBytesWritable kmer) {
+ byte state, VKmerBytesWritable kmer) {
set(nodeIdList, forwardForwardList, forwardReverseList,
reverseForwardList, reverseReverseList,
state, kmer);
@@ -75,14 +76,14 @@
public void set(PositionListWritable nodeIdList, KmerListWritable forwardForwardList, KmerListWritable forwardReverseList,
KmerListWritable reverseForwardList, KmerListWritable reverseReverseList,
- byte state, KmerBytesWritable kmer) {
- this.kmerlength = kmer.kmerByteSize;
+ byte state, VKmerBytesWritable kmer) {
+ this.kmerlength = kmer.getKmerLetterLength();
this.incomingList.setForwardList(reverseForwardList);
this.incomingList.setReverseList(reverseReverseList);
this.outgoingList.setForwardList(forwardForwardList);
this.outgoingList.setReverseList(forwardReverseList);
this.state = state;
- this.kmer.set(kmer);
+ this.kmer.setAsCopy(kmer);
}
public void set(VertexValueWritable value) {
@@ -166,17 +167,17 @@
}
public int getLengthOfKmer() {
- return kmer.getKmerLength();
+ return kmer.getKmerLetterLength();
}
- public KmerBytesWritable getKmer() {
+ public VKmerBytesWritable getKmer() {
return kmer;
}
- public void setKmer(KmerBytesWritable kmer) {
- this.kmer.set(kmer);
+ public void setKmer(VKmerBytesWritable kmer) {
+ this.kmer.setAsCopy(kmer);
}
-
+
public int getKmerlength() {
return kmerlength;
}
@@ -188,11 +189,11 @@
public void reset(int kmerSize) {
this.kmerlength = kmerSize;
this.nodeIdList.reset();
- this.incomingList.getForwardList().reset(kmerSize);
- this.incomingList.getReverseList().reset(kmerSize);
- this.outgoingList.getForwardList().reset(kmerSize);
- this.outgoingList.getReverseList().reset(kmerSize);
- this.kmer.reset(0);
+ this.incomingList.getForwardList().reset();
+ this.incomingList.getReverseList().reset();
+ this.outgoingList.getForwardList().reset();
+ this.outgoingList.getReverseList().reset();
+// this.kmer.reset(0);
}
@Override
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/DataLoadLogFormatter.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/DataLoadLogFormatter.java
index bb60a25..95e070f 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/DataLoadLogFormatter.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/DataLoadLogFormatter.java
@@ -4,7 +4,7 @@
import java.util.logging.Handler;
import java.util.logging.LogRecord;
-import edu.uci.ics.genomix.oldtype.NodeWritable;
+import edu.uci.ics.genomix.type.NodeWritable;
public class DataLoadLogFormatter extends Formatter {
private NodeWritable key;
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/LogAlgorithmLogFormatter.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/LogAlgorithmLogFormatter.java
index 82ca03f..dd78cde 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/LogAlgorithmLogFormatter.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/LogAlgorithmLogFormatter.java
@@ -12,11 +12,11 @@
// Create a DateFormat to format the logger timestamp.
//
private long step;
- private KmerBytesWritable sourceVertexId = new KmerBytesWritable(1);
- private KmerBytesWritable destVertexId = new KmerBytesWritable(1);
+ private KmerBytesWritable sourceVertexId = new KmerBytesWritable();
+ private KmerBytesWritable destVertexId = new KmerBytesWritable();
private MessageWritable msg = new MessageWritable();
private byte state;
- private KmerBytesWritable mergeChain = new KmerBytesWritable(1);;
+ private KmerBytesWritable mergeChain = new KmerBytesWritable();
//private boolean testDelete = false;
/**
* 0: general operation
@@ -32,8 +32,8 @@
public void set(long step, KmerBytesWritable sourceVertexId, KmerBytesWritable destVertexId,
MessageWritable msg, byte state) {
this.step = step;
- this.sourceVertexId.set(sourceVertexId);
- this.destVertexId.set(destVertexId);
+ this.sourceVertexId.setAsCopy(sourceVertexId);
+ this.destVertexId.setAsCopy(destVertexId);
this.msg = msg;
this.state = state;
this.operation = 0;
@@ -42,24 +42,24 @@
public void setMergeChain(long step, KmerBytesWritable sourceVertexId, KmerBytesWritable mergeChain) {
this.reset();
this.step = step;
- this.sourceVertexId.set(sourceVertexId);
- this.mergeChain.set(mergeChain);
+ this.sourceVertexId.setAsCopy(sourceVertexId);
+ this.mergeChain.setAsCopy(mergeChain);
this.operation = 2;
}
public void setVotoToHalt(long step, KmerBytesWritable sourceVertexId) {
this.reset();
this.step = step;
- this.sourceVertexId.set(sourceVertexId);
+ this.sourceVertexId.setAsCopy(sourceVertexId);
this.operation = 3;
}
public void reset() {
- this.sourceVertexId = new KmerBytesWritable(1);
- this.destVertexId = new KmerBytesWritable(1);
+ this.sourceVertexId = new KmerBytesWritable();
+ this.destVertexId = new KmerBytesWritable();
this.msg = new MessageWritable();
this.state = 0;
- this.mergeChain = new KmerBytesWritable(1);
+ this.mergeChain = new KmerBytesWritable();
}
public String format(LogRecord record) {
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/NaiveAlgorithmLogFormatter.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/NaiveAlgorithmLogFormatter.java
index 4a5850a..6b23074 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/NaiveAlgorithmLogFormatter.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/NaiveAlgorithmLogFormatter.java
@@ -15,8 +15,8 @@
public void set(long step, KmerBytesWritable sourceVertexId, KmerBytesWritable destVertexId) {
this.step = step;
- this.sourceVertexId.set(sourceVertexId);
- this.destVertexId.set(destVertexId);
+ this.sourceVertexId.setAsCopy(sourceVertexId);
+ this.destVertexId.setAsCopy(destVertexId);
}
public String format(LogRecord record) {
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bridgeremove/BridgeAddVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bridgeremove/BridgeAddVertex.java
index fa353d0..89b66e6 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bridgeremove/BridgeAddVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bridgeremove/BridgeAddVertex.java
@@ -3,7 +3,7 @@
import java.util.Iterator;
import org.apache.hadoop.io.NullWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+//import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.pregelix.api.graph.Vertex;
import edu.uci.ics.pregelix.api.job.PregelixJob;
import edu.uci.ics.pregelix.api.util.BspUtils;
@@ -14,6 +14,7 @@
import edu.uci.ics.genomix.pregelix.format.GraphCleanOutputFormat;
import edu.uci.ics.genomix.pregelix.io.MessageWritable;
import edu.uci.ics.genomix.pregelix.io.VertexValueWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
/*
* vertexId: BytesWritable
@@ -87,7 +88,7 @@
* set the vertex value
*/
byte[] array = { 'T', 'A', 'G', 'C', 'C'};
- KmerBytesWritable kmer = new KmerBytesWritable(array.length);
+ VKmerBytesWritable kmer = new VKmerBytesWritable(array.length);
kmer.setByRead(array, 0);
vertexValue.setKmer(kmer);
PositionListWritable plist = new PositionListWritable();
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bubblemerge/BubbleAddVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bubblemerge/BubbleAddVertex.java
index ebb4f74..c0ba1a9 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bubblemerge/BubbleAddVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/bubblemerge/BubbleAddVertex.java
@@ -4,6 +4,7 @@
import org.apache.hadoop.io.NullWritable;
import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
import edu.uci.ics.pregelix.api.graph.Vertex;
import edu.uci.ics.pregelix.api.job.PregelixJob;
import edu.uci.ics.pregelix.api.util.BspUtils;
@@ -83,7 +84,7 @@
* set the vertex value
*/
byte[] array = { 'T', 'A', 'G', 'C', 'C', 'A', 'G'}; //TAGCCAG
- KmerBytesWritable kmer = new KmerBytesWritable(array.length);
+ VKmerBytesWritable kmer = new VKmerBytesWritable(array.length);
kmer.setByRead(array, 0);
vertexValue.setKmer(kmer);
PositionListWritable plist = new PositionListWritable();
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicGraphCleanVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicGraphCleanVertex.java
index ca91edb..64965e3 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicGraphCleanVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicGraphCleanVertex.java
@@ -14,13 +14,14 @@
import edu.uci.ics.genomix.pregelix.type.MessageFlag;
import edu.uci.ics.genomix.pregelix.util.VertexUtil;
import edu.uci.ics.genomix.type.GeneCode;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+//import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
/**
* Naive Algorithm for path merge graph
*/
public class BasicGraphCleanVertex extends
- Vertex<KmerBytesWritable, VertexValueWritable, NullWritable, MessageWritable> {
+ Vertex<VKmerBytesWritable, VertexValueWritable, NullWritable, MessageWritable> {
public static final String KMER_SIZE = "BasicGraphCleanVertex.kmerSize";
public static final String ITERATIONS = "BasicGraphCleanVertex.iteration";
public static int kmerSize = -1;
@@ -28,9 +29,9 @@
protected MessageWritable incomingMsg = null;
protected MessageWritable outgoingMsg = null;
- protected KmerBytesWritable destVertexId = new KmerBytesWritable();
- protected Iterator<KmerBytesWritable> posIterator;
- protected KmerBytesWritable tmpKmer = new KmerBytesWritable(kmerSize);
+ protected VKmerBytesWritable destVertexId = new VKmerBytesWritable();
+ protected Iterator<VKmerBytesWritable> posIterator;
+ protected VKmerBytesWritable tmpKmer = new VKmerBytesWritable();
byte headFlag;
protected byte outFlag;
protected byte inFlag;
@@ -67,7 +68,7 @@
/**
* get destination vertex
*/
- public KmerBytesWritable getNextDestVertexId(VertexValueWritable value) {
+ public VKmerBytesWritable getNextDestVertexId(VertexValueWritable value) {
if (value.getFFList().getCountOfPosition() > 0){ // #FFList() > 0
posIterator = value.getFFList().iterator();
return posIterator.next();
@@ -79,7 +80,7 @@
}
}
- public KmerBytesWritable getPreDestVertexId(VertexValueWritable value) {
+ public VKmerBytesWritable getPreDestVertexId(VertexValueWritable value) {
if (value.getRFList().getCountOfPosition() > 0){ // #RFList() > 0
posIterator = value.getRFList().iterator();
return posIterator.next();
@@ -133,12 +134,12 @@
public void sendMsgToAllNextNodes(VertexValueWritable value) {
posIterator = value.getFFList().iterator(); // FFList
while(posIterator.hasNext()){
- destVertexId.set(posIterator.next());
+ destVertexId.setAsCopy(posIterator.next());
sendMsg(destVertexId, outgoingMsg);
}
posIterator = value.getFRList().iterator(); // FRList
while(posIterator.hasNext()){
- destVertexId.set(posIterator.next());
+ destVertexId.setAsCopy(posIterator.next());
sendMsg(destVertexId, outgoingMsg);
}
}
@@ -149,12 +150,12 @@
public void sendMsgToAllPreviousNodes(VertexValueWritable value) {
posIterator = value.getRFList().iterator(); // RFList
while(posIterator.hasNext()){
- destVertexId.set(posIterator.next());
+ destVertexId.setAsCopy(posIterator.next());
sendMsg(destVertexId, outgoingMsg);
}
posIterator = value.getRRList().iterator(); // RRList
while(posIterator.hasNext()){
- destVertexId.set(posIterator.next());
+ destVertexId.setAsCopy(posIterator.next());
sendMsg(destVertexId, outgoingMsg);
}
}
@@ -175,14 +176,14 @@
while(posIterator.hasNext()){
outgoingMsg.setFlag(AdjMessage.FROMRF);
outgoingMsg.setSourceVertexId(getVertexId());
- destVertexId.set(posIterator.next());
+ destVertexId.setAsCopy(posIterator.next());
sendMsg(destVertexId, outgoingMsg);
}
posIterator = value.getRRList().iterator(); // RRList
while(posIterator.hasNext()){
outgoingMsg.setFlag(AdjMessage.FROMRR);
outgoingMsg.setSourceVertexId(getVertexId());
- destVertexId.set(posIterator.next());
+ destVertexId.setAsCopy(posIterator.next());
sendMsg(destVertexId, outgoingMsg);
}
}
@@ -195,14 +196,14 @@
while(posIterator.hasNext()){
outgoingMsg.setFlag(AdjMessage.FROMFF);
outgoingMsg.setSourceVertexId(getVertexId());
- destVertexId.set(posIterator.next());
+ destVertexId.setAsCopy(posIterator.next());
sendMsg(destVertexId, outgoingMsg);
}
posIterator = value.getFRList().iterator(); // FRList
while(posIterator.hasNext()){
outgoingMsg.setFlag(AdjMessage.FROMFR);
outgoingMsg.setSourceVertexId(getVertexId());
- destVertexId.set(posIterator.next());
+ destVertexId.setAsCopy(posIterator.next());
sendMsg(destVertexId, outgoingMsg);
}
}
@@ -380,7 +381,7 @@
outgoingMsg.setFlag(outFlag);
outgoingMsg.setNeighberNode(getVertexValue().getIncomingList());
outgoingMsg.setSourceVertexId(getVertexId());
- outgoingMsg.setAcutalKmer(getVertexValue().getKmer());
+ outgoingMsg.setActualKmer(getVertexValue().getKmer());
sendMsg(incomingMsg.getSourceVertexId(), outgoingMsg); //getNextDestVertexId(getVertexValue())
break;
case MessageFlag.DIR_RF:
@@ -393,7 +394,7 @@
outgoingMsg.setFlag(outFlag);
outgoingMsg.setNeighberNode(getVertexValue().getOutgoingList());
outgoingMsg.setSourceVertexId(getVertexId());
- outgoingMsg.setAcutalKmer(getVertexValue().getKmer());
+ outgoingMsg.setActualKmer(getVertexValue().getKmer());
sendMsg(incomingMsg.getSourceVertexId(), outgoingMsg); //getPreDestVertexId(getVertexValue())
break;
}
@@ -418,7 +419,7 @@
outgoingMsg.setFlag(outFlag);
outgoingMsg.setNeighberNode(getVertexValue().getIncomingList());
outgoingMsg.setSourceVertexId(getVertexId());
- outgoingMsg.setAcutalKmer(getVertexValue().getKmer());
+ outgoingMsg.setActualKmer(getVertexValue().getKmer());
sendMsg(incomingMsg.getSourceVertexId(), outgoingMsg); //getNextDestVertexId(getVertexValue())
break;
case MessageFlag.DIR_RF:
@@ -431,7 +432,7 @@
outgoingMsg.setFlag(outFlag);
outgoingMsg.setNeighberNode(getVertexValue().getOutgoingList());
outgoingMsg.setSourceVertexId(getVertexId());
- outgoingMsg.setAcutalKmer(getVertexValue().getKmer());
+ outgoingMsg.setActualKmer(getVertexValue().getKmer());
sendMsg(incomingMsg.getSourceVertexId(), outgoingMsg); //getPreDestVertexId(getVertexValue())
break;
}
@@ -454,7 +455,7 @@
outgoingMsg.setFlag(outFlag);
outgoingMsg.setNeighberNode(getVertexValue().getIncomingList());
outgoingMsg.setSourceVertexId(getVertexId());
- outgoingMsg.setAcutalKmer(getVertexValue().getKmer());
+ outgoingMsg.setActualKmer(getVertexValue().getKmer());
sendMsg(getNextDestVertexId(getVertexValue()), outgoingMsg);
deleteVertex(getVertexId());
break;
@@ -467,7 +468,7 @@
outgoingMsg.setFlag(outFlag);
outgoingMsg.setNeighberNode(getVertexValue().getOutgoingList());
outgoingMsg.setSourceVertexId(getVertexId());
- outgoingMsg.setAcutalKmer(getVertexValue().getKmer());
+ outgoingMsg.setActualKmer(getVertexValue().getKmer());
sendMsg(getPreDestVertexId(getVertexValue()), outgoingMsg);
deleteVertex(getVertexId());
break;
@@ -627,7 +628,7 @@
match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
msgString = msg.getActualKmer().toString();
index = msgString.indexOf(match);
- tmpKmer.reset(msgString.length() - index);
+// tmpKmer.reset(msgString.length() - index);
tmpKmer.setByRead(msgString.substring(index).getBytes(), 0);
break;
case MessageFlag.DIR_FR:
@@ -635,7 +636,7 @@
match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
msgString = GeneCode.reverseComplement(msg.getActualKmer().toString());
index = msgString.indexOf(match);
- tmpKmer.reset(msgString.length() - index);
+// tmpKmer.reset(msgString.length() - index);
tmpKmer.setByReadReverse(msgString.substring(index).getBytes(), 0);
break;
case MessageFlag.DIR_RF:
@@ -643,7 +644,7 @@
match = selfString.substring(0,kmerSize - 1);
msgString = GeneCode.reverseComplement(msg.getActualKmer().toString());
index = msgString.lastIndexOf(match) + kmerSize - 2;
- tmpKmer.reset(index + 1);
+// tmpKmer.reset(index + 1);
tmpKmer.setByReadReverse(msgString.substring(0, index + 1).getBytes(), 0);
break;
case MessageFlag.DIR_RR:
@@ -651,7 +652,7 @@
match = selfString.substring(0,kmerSize - 1);
msgString = msg.getActualKmer().toString();
index = msgString.lastIndexOf(match) + kmerSize - 2;
- tmpKmer.reset(index + 1);
+// tmpKmer.reset(index + 1); // TODO: fix ALL of these resets (only if you need to)
tmpKmer.setByRead(msgString.substring(0, index + 1).getBytes(), 0);
break;
}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/MapReduceVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/MapReduceVertex.java
index a12c583..ecd3c4f 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/MapReduceVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/MapReduceVertex.java
@@ -12,8 +12,9 @@
import edu.uci.ics.genomix.pregelix.io.VertexValueWritable;
import edu.uci.ics.genomix.pregelix.io.VertexValueWritable.State;
import edu.uci.ics.genomix.pregelix.type.MessageFlag;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+//import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerListWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
import edu.uci.ics.pregelix.api.graph.Vertex;
import edu.uci.ics.pregelix.api.job.PregelixJob;
import edu.uci.ics.pregelix.api.util.BspUtils;
@@ -22,11 +23,11 @@
BasicGraphCleanVertex {
public static boolean fakeVertexExist = false;
- protected static KmerBytesWritable fakeVertex = null;
+ protected static VKmerBytesWritable fakeVertex = null;
- protected KmerBytesWritable reverseKmer;
+ protected VKmerBytesWritable reverseKmer;
protected KmerListWritable kmerList = null;
- protected Map<KmerBytesWritable, KmerListWritable> kmerMapper = new HashMap<KmerBytesWritable, KmerListWritable>();
+ protected Map<VKmerBytesWritable, KmerListWritable> kmerMapper = new HashMap<VKmerBytesWritable, KmerListWritable>();
/**
* initiate kmerSize, maxIteration
@@ -43,13 +44,14 @@
else
outgoingMsg.reset(kmerSize);
if(reverseKmer == null)
- reverseKmer = new KmerBytesWritable(kmerSize);
+ reverseKmer = new VKmerBytesWritable();
if(kmerList == null)
- kmerList = new KmerListWritable(kmerSize);
+ kmerList = new KmerListWritable();
else
- kmerList.reset(kmerSize);
+ kmerList.reset();
if(fakeVertex == null){
- fakeVertex = new KmerBytesWritable(kmerSize + 1);
+// fakeVertex = new KmerBytesWritable(kmerSize + 1); // TODO check if merge is correct
+ fakeVertex = new KmerBytesWritable();
String random = generaterRandomString(kmerSize + 1);
fakeVertex.setByRead(random.getBytes(), 0);
}
@@ -94,7 +96,7 @@
public void sendMsgToFakeVertex(){
if(!getVertexValue().isFakeVertex()){
outgoingMsg.setSourceVertexId(getVertexId());
- outgoingMsg.setAcutalKmer(getVertexValue().getKmer());
+ outgoingMsg.setActualKmer(getVertexValue().getKmer());
sendMsg(fakeVertex, outgoingMsg);
voteToHalt();
}
@@ -105,18 +107,18 @@
incomingMsg = msgIterator.next();
String kmerString = incomingMsg.getActualKmer().toString();
tmpKmer.reset(kmerString.length());
- reverseKmer.reset(kmerString.length());
+// reverseKmer.reset(kmerString.length());//kmerbyteswritable
tmpKmer.setByRead(kmerString.getBytes(), 0);
reverseKmer.setByReadReverse(kmerString.getBytes(), 0);
if(reverseKmer.compareTo(tmpKmer) < 0)
- tmpKmer.set(reverseKmer);
+ tmpKmer.setAsCopy(reverseKmer);
if(!kmerMapper.containsKey(tmpKmer)){
kmerList.reset();
kmerList.append(incomingMsg.getSourceVertexId());
kmerMapper.put(tmpKmer, kmerList);
} else{
- kmerList.set(kmerMapper.get(tmpKmer));
+ kmerList.setCopy(kmerMapper.get(tmpKmer));
kmerList.append(incomingMsg.getSourceVertexId());
kmerMapper.put(tmpKmer, kmerList);
}
@@ -124,12 +126,12 @@
}
public void reduceKeyByActualKmer(){
- for(KmerBytesWritable key : kmerMapper.keySet()){
+ for(VKmerBytesWritable key : kmerMapper.keySet()){
kmerList = kmerMapper.get(key);
for(int i = 1; i < kmerList.getCountOfPosition(); i++){
//send kill message
outgoingMsg.setFlag(MessageFlag.KILL);
- destVertexId.set(kmerList.getPosition(i));
+ destVertexId.setAsCopy(kmerList.getPosition(i));
sendMsg(destVertexId, outgoingMsg);
}
}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P1ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P1ForPathMergeVertex.java
index 31fb897..3447f25 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P1ForPathMergeVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P1ForPathMergeVertex.java
@@ -3,7 +3,7 @@
import java.util.Iterator;
import org.apache.hadoop.io.NullWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+//import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerBytesWritableFactory;
import edu.uci.ics.pregelix.api.graph.Vertex;
@@ -60,7 +60,7 @@
private MessageWritable outgoingMsg = new MessageWritable();
private KmerBytesWritableFactory kmerFactory = new KmerBytesWritableFactory(1);
- private KmerBytesWritable lastKmer = new KmerBytesWritable(1);
+ private VKmerBytesWritable lastKmer = new VKmerBytesWritable();
private PositionWritable destVertexId = new PositionWritable();
private Iterator<PositionWritable> posIterator;
@@ -184,7 +184,7 @@
*/
public void mergeChainVertex() {
//merge chain
- lastKmer.set(kmerFactory.getLastKmerFromChain(incomingMsg.getLengthOfChain() - kmerSize + 1,
+ lastKmer.setAsCopy(kmerFactory.getLastKmerFromChain(incomingMsg.getLengthOfChain() - kmerSize + 1,
incomingMsg.getActualKmer()));
getVertexValue().setKmer(kmerFactory.mergeTwoKmer(getVertexValue().getKmer(), lastKmer));
getVertexValue().setOutgoingList(incomingMsg.getNeighberNode());
@@ -225,7 +225,7 @@
outgoingMsg.setAcutalKmer(getVertexValue().getKmer());
if (getVertexValue().getState() == State.IS_HEAD)//is_tail
outgoingMsg.setFlag(Message.STOP);
- destVertexId.set(incomingMsg.getSourceVertexId());
+ destVertexId.setAsCopy(incomingMsg.getSourceVertexId());
sendMsg(destVertexId, outgoingMsg);
}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P2ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P2ForPathMergeVertex.java
index b7ab62b..ef39a23 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P2ForPathMergeVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P2ForPathMergeVertex.java
@@ -14,6 +14,7 @@
import edu.uci.ics.genomix.pregelix.type.MessageFromHead;
import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerListWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
/*
* vertexId: BytesWritable
* vertexValue: VertexValueWritable
@@ -67,13 +68,14 @@
outgoingMsg.reset(kmerSize);
receivedMsgList.clear();
if(reverseKmer == null)
- reverseKmer = new KmerBytesWritable(kmerSize);
+ reverseKmer = new VKmerBytesWritable();
if(kmerList == null)
- kmerList = new KmerListWritable(kmerSize);
+ kmerList = new KmerListWritable();
else
- kmerList.reset(kmerSize);
+ kmerList.reset();
if(fakeVertex == null){
- fakeVertex = new KmerBytesWritable(kmerSize + 1);
+// fakeVertex = new KmerBytesWritable(kmerSize + 1);
+ fakeVertex = new VKmerBytesWritable();
String random = generaterRandomString(kmerSize + 1);
fakeVertex.setByRead(random.getBytes(), 0);
}
@@ -87,7 +89,7 @@
//send wantToMerge to next
tmpKmer = getNextDestVertexIdAndSetFlag(getVertexValue());
if(tmpKmer != null){
- destVertexId.set(tmpKmer);
+ destVertexId.setAsCopy(tmpKmer);
outgoingMsg.setFlag(outFlag);
outgoingMsg.setSourceVertexId(getVertexId());
sendMsg(destVertexId, outgoingMsg);
@@ -96,7 +98,7 @@
//send wantToMerge to prev
tmpKmer = getPreDestVertexIdAndSetFlag(getVertexValue());
if(tmpKmer != null){
- destVertexId.set(tmpKmer);
+ destVertexId.setAsCopy(tmpKmer);
outgoingMsg.setFlag(outFlag);
outgoingMsg.setSourceVertexId(getVertexId());
sendMsg(destVertexId, outgoingMsg);
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P3ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P3ForPathMergeVertex.java
index d806094..cf35c7a 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P3ForPathMergeVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P3ForPathMergeVertex.java
@@ -3,7 +3,7 @@
import java.util.Iterator;
import org.apache.hadoop.io.NullWritable;
-import edu.uci.ics.genomix.type.KmerBytesWritable;
+//import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerBytesWritableFactory;
import edu.uci.ics.pregelix.api.graph.Vertex;
@@ -64,7 +64,7 @@
private MessageWritable outgoingMsg = new MessageWritable();
private KmerBytesWritableFactory kmerFactory = new KmerBytesWritableFactory(1);
- private KmerBytesWritable lastKmer = new KmerBytesWritable(1);
+ private VKmerBytesWritable lastKmer = new VKmerBytesWritable(1);
private PositionWritable destVertexId = new PositionWritable();
private Iterator<PositionWritable> posIterator;
@@ -230,7 +230,7 @@
* merge chain vertex
*/
public void mergeChainVertex(){
- lastKmer.set(kmerFactory.getLastKmerFromChain(incomingMsg.getLengthOfChain() - kmerSize + 1,
+ lastKmer.setAsCopy(kmerFactory.getLastKmerFromChain(incomingMsg.getLengthOfChain() - kmerSize + 1,
incomingMsg.getActualKmer()));
getVertexValue().setKmer(
kmerFactory.mergeTwoKmer(getVertexValue().getKmer(),
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java
index 8f97c5a..ecfafa7 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P4ForPathMergeVertex.java
@@ -110,12 +110,12 @@
*/
protected boolean setNextInfo(VertexValueWritable value) {
if (value.getFFList().getCountOfPosition() > 0) {
- nextKmer.set(value.getFFList().getPosition(0));
+ nextKmer.setAsCopy(value.getFFList().getPosition(0));
nextHead = isNodeRandomHead(nextKmer);
return true;
}
if (value.getFRList().getCountOfPosition() > 0) {
- nextKmer.set(value.getFRList().getPosition(0));
+ nextKmer.setAsCopy(value.getFRList().getPosition(0));
nextHead = isNodeRandomHead(nextKmer);
return true;
}
@@ -127,12 +127,12 @@
*/
protected boolean setPrevInfo(VertexValueWritable value) {
if (value.getRRList().getCountOfPosition() > 0) {
- prevKmer.set(value.getRRList().getPosition(0));
+ prevKmer.setAsCopy(value.getRRList().getPosition(0));
prevHead = isNodeRandomHead(prevKmer);
return true;
}
if (value.getRFList().getCountOfPosition() > 0) {
- prevKmer.set(value.getRFList().getPosition(0));
+ prevKmer.setAsCopy(value.getRFList().getPosition(0));
prevHead = isNodeRandomHead(prevKmer);
return true;
}
@@ -155,7 +155,7 @@
setStateAsNoMerge();
// only PATH vertices are present. Find the ID's for my neighbors
- curKmer.set(getVertexId());
+ curKmer.setAsCopy(getVertexId());
curHead = isNodeRandomHead(curKmer);
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P5ForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P5ForPathMergeVertex.java
index 8a2a301..3f91ac1 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P5ForPathMergeVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/P5ForPathMergeVertex.java
@@ -118,7 +118,7 @@
return true;
}
if (value.getFRList().getCountOfPosition() > 0) {
- nextID.set(value.getFRList().getPosition(0));
+ nextID.setAsCopy(value.getFRList().getPosition(0));
nextHead = isNodeRandomHead(nextID);
return true;
}
@@ -130,12 +130,12 @@
*/
protected boolean setPrevInfo(VertexValueWritable value) {
if (value.getRRList().getCountOfPosition() > 0) {
- prevID.set(value.getRRList().getPosition(0));
+ prevID.setAsCopy(value.getRRList().getPosition(0));
prevHead = isNodeRandomHead(prevID);
return true;
}
if (value.getRFList().getCountOfPosition() > 0) {
- prevID.set(value.getRFList().getPosition(0));
+ prevID.setAsCopy(value.getRFList().getPosition(0));
prevHead = isNodeRandomHead(prevID);
return true;
}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/splitrepeat/SplitRepeatVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/splitrepeat/SplitRepeatVertex.java
index 300d7b0..8546bb6 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/splitrepeat/SplitRepeatVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/splitrepeat/SplitRepeatVertex.java
@@ -14,6 +14,7 @@
import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerListWritable;
import edu.uci.ics.genomix.type.PositionWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
import edu.uci.ics.pregelix.api.graph.Vertex;
import edu.uci.ics.pregelix.api.util.BspUtils;
@@ -21,18 +22,18 @@
BasicGraphCleanVertex{
public class CreatedVertex{
- KmerBytesWritable createdVertexId;
+ VKmerBytesWritable createdVertexId;
String incomingDir;
String outgoingDir;
- KmerBytesWritable incomingEdge;
- KmerBytesWritable outgoingEdge;
+ VKmerBytesWritable incomingEdge;
+ VKmerBytesWritable outgoingEdge;
public CreatedVertex(){
- createdVertexId = new KmerBytesWritable(kmerSize);
+ createdVertexId = new VKmerBytesWritable(kmerSize);
incomingDir = "";
outgoingDir = "";
- incomingEdge = new KmerBytesWritable(kmerSize);
- outgoingEdge = new KmerBytesWritable(kmerSize);
+ incomingEdge = new VKmerBytesWritable(kmerSize);
+ outgoingEdge = new VKmerBytesWritable(kmerSize);
}
public void clear(){
@@ -43,7 +44,7 @@
outgoingEdge.reset(kmerSize);
}
- public KmerBytesWritable getCreatedVertexId() {
+ public VKmerBytesWritable getCreatedVertexId() {
return createdVertexId;
}
@@ -67,7 +68,7 @@
this.outgoingDir = outgoingDir;
}
- public KmerBytesWritable getIncomingEdge() {
+ public VKmerBytesWritable getIncomingEdge() {
return incomingEdge;
}
@@ -75,7 +76,7 @@
this.incomingEdge.set(incomingEdge);
}
- public KmerBytesWritable getOutgoingEdge() {
+ public VKmerBytesWritable getOutgoingEdge() {
return outgoingEdge;
}
@@ -99,8 +100,8 @@
private Set<Long> outgoingEdgeIntersection = new HashSet<Long>();
private Set<Long> neighborEdgeIntersection = new HashSet<Long>();
private Map<KmerBytesWritable, Set<Long>> kmerMap = new HashMap<KmerBytesWritable, Set<Long>>();
- private KmerListWritable incomingEdgeList = null;
- private KmerListWritable outgoingEdgeList = null;
+ private VKmerListWritable incomingEdgeList = null;
+ private VKmerListWritable outgoingEdgeList = null;
private byte incomingEdgeDir = 0;
private byte outgoingEdgeDir = 0;
@@ -123,11 +124,11 @@
else
outgoingMsg.reset(kmerSize);
if(incomingEdgeList == null)
- incomingEdgeList = new KmerListWritable(kmerSize);
+ incomingEdgeList = new VKmerListWritable(kmerSize);
if(outgoingEdgeList == null)
- outgoingEdgeList = new KmerListWritable(kmerSize);
+ outgoingEdgeList = new VKmerListWritable(kmerSize);
if(createdVertexId == null)
- createdVertexId = new KmerBytesWritable(kmerSize + 1);
+ createdVertexId = new VKmerBytesWritable(kmerSize + 1);
}
/**
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/tipremove/TipRemoveVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/tipremove/TipRemoveVertex.java
index c1138de..b4f2407 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/tipremove/TipRemoveVertex.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/tipremove/TipRemoveVertex.java
@@ -71,7 +71,7 @@
else if(getVertexValue().getFRList().getCountOfPosition() > 0)
outgoingMsg.setFlag(AdjMessage.FROMFR);
outgoingMsg.setSourceVertexId(getVertexId());
- destVertexId.set(getNextDestVertexId(getVertexValue()));
+ destVertexId.setAsCopy(getNextDestVertexId(getVertexValue()));
sendMsg(destVertexId, outgoingMsg);
deleteVertex(getVertexId());
}
@@ -83,7 +83,7 @@
else if(getVertexValue().getRRList().getCountOfPosition() > 0)
outgoingMsg.setFlag(AdjMessage.FROMRR);
outgoingMsg.setSourceVertexId(getVertexId());
- destVertexId.set(getPreDestVertexId(getVertexValue()));
+ destVertexId.setAsCopy(getPreDestVertexId(getVertexValue()));
sendMsg(destVertexId, outgoingMsg);
deleteVertex(getVertexId());
}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/ConvertNodeToIdValue.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/ConvertNodeToIdValue.java
index fa5ae19..13d9d98 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/ConvertNodeToIdValue.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/ConvertNodeToIdValue.java
@@ -11,8 +11,8 @@
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
-import edu.uci.ics.genomix.oldtype.NodeWritable;
-import edu.uci.ics.genomix.oldtype.PositionWritable;
+import edu.uci.ics.genomix.type.NodeWritable;
+import edu.uci.ics.genomix.type.PositionWritable;
import edu.uci.ics.genomix.pregelix.io.VertexValueWritable;
import edu.uci.ics.genomix.pregelix.io.VertexValueWritable.State;
@@ -33,8 +33,8 @@
VertexValueWritable outputValue = new VertexValueWritable();
while(reader.next(node, value)) {
- System.out.println(node.getNodeID().toString());
- outputKey.set(node.getNodeID());
+// System.out.println(node.getNodeID().toString());
+// outputKey.set(node.getNodeID());
outputValue.setFFList(node.getFFList());
outputValue.setFRList(node.getFRList());
outputValue.setRFList(node.getRFList());
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateSmallFile.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateSmallFile.java
index 6b9eb4e..d3180c8 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateSmallFile.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateSmallFile.java
@@ -28,7 +28,8 @@
SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, inFile, conf);
SequenceFile.Writer writer = SequenceFile.createWriter(fileSys, conf, outFile, KmerBytesWritable.class,
NullWritable.class, CompressionType.NONE);
- KmerBytesWritable outKey = new KmerBytesWritable(55);
+ KmerBytesWritable.setGlobalKmerLength(55);
+ KmerBytesWritable outKey = new KmerBytesWritable();
int i = 0;
for (i = 0; i < numOfLines; i++) {
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateTextFile.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateTextFile.java
index bc08600..8618237 100644
--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateTextFile.java
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateTextFile.java
@@ -19,6 +19,7 @@
public class GenerateTextFile {
public static void generateFromPathmergeResult(int kmerSize, String strSrcDir, String outPutDir) throws IOException {
+ KmerBytesWritable.setGlobalKmerLength(kmerSize);
Configuration conf = new Configuration();
FileSystem fileSys = FileSystem.getLocal(conf);
@@ -44,13 +45,14 @@
}
public static void generateSpecificLengthChainFromNaivePathmergeResult(int maxLength) throws IOException {
+ KmerBytesWritable.setGlobalKmerLength(55);
BufferedWriter bw = new BufferedWriter(new FileWriter("naive_text_" + maxLength));
Configuration conf = new Configuration();
FileSystem fileSys = FileSystem.get(conf);
for (int i = 0; i < 2; i++) {
Path path = new Path("/home/anbangx/genomix_result/final_naive/part-" + i);
SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, path, conf);
- KmerBytesWritable key = new KmerBytesWritable(55);
+ KmerBytesWritable key = new KmerBytesWritable();
VertexValueWritable value = new VertexValueWritable();
while (reader.next(key, value)) {
@@ -68,13 +70,14 @@
}
public static void generateSpecificLengthChainFromLogPathmergeResult(int maxLength) throws IOException {
+ KmerBytesWritable.setGlobalKmerLength(55);
BufferedWriter bw = new BufferedWriter(new FileWriter("log_text_" + maxLength));
Configuration conf = new Configuration();
FileSystem fileSys = FileSystem.get(conf);
for (int i = 0; i < 2; i++) {
Path path = new Path("/home/anbangx/genomix_result/improvelog2/part-" + i);
SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, path, conf);
- KmerBytesWritable key = new KmerBytesWritable(55);
+ KmerBytesWritable key = new KmerBytesWritable();
VertexValueWritable value = new VertexValueWritable();
while (reader.next(key, value)) {
@@ -93,12 +96,13 @@
}
public static void generateFromGraphbuildResult() throws IOException {
+ KmerBytesWritable.setGlobalKmerLength(55);
BufferedWriter bw = new BufferedWriter(new FileWriter("textfile"));
Configuration conf = new Configuration();
FileSystem fileSys = FileSystem.get(conf);
Path path = new Path("data/input/part-0-out-3000000");
SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, path, conf);
- KmerBytesWritable key = new KmerBytesWritable(55);
+ KmerBytesWritable key = new KmerBytesWritable();
while (reader.next(key, null)) {
if (key == null) {
diff --git a/patch.diff b/patch.diff
new file mode 100644
index 0000000..a333970
--- /dev/null
+++ b/patch.diff
@@ -0,0 +1,245 @@
+From 9e006501f9e33467a8428199bd94b71dbff063ef Mon Sep 17 00:00:00 2001
+From: Anbang Xu <anbangx@gmail.com>
+Date: Fri, 26 Jul 2013 14:10:33 -0700
+Subject: [PATCH] p2 pass all the tests except 9
+
+---
+ .../genomix/data/test/KmerBytesWritableTest.java | 76 +++++++++++++++++++++-
+ .../genomix/pregelix/io/VertexValueWritable.java | 2 +-
+ .../operator/pathmerge/BasicPathMergeVertex.java | 35 +++++-----
+ .../pathmerge/LogAlgorithmForPathMergeVertex.java | 8 +--
+ .../pregelix/JobRun/PathMergeSmallTestSuite.java | 2 +-
+ 5 files changed, 98 insertions(+), 25 deletions(-)
+
+diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
+index bda73e5..fbfbeeb 100644
+--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
++++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
+@@ -229,14 +229,34 @@ public class KmerBytesWritableTest {
+ merge.mergeWithRFKmer(i, kmer2);
+ Assert.assertEquals("GGCACAACAACCC", merge.toString());
+
+- String test1 = "CTA";
+- String test2 = "AGA";
++ String test1;
++ String test2;
++ test1 = "CTA";
++ test2 = "AGA";
+ KmerBytesWritable k1 = new KmerBytesWritable(3);
+ KmerBytesWritable k2 = new KmerBytesWritable(3);
+ k1.setByRead(test1.getBytes(), 0);
+ k2.setByRead(test2.getBytes(), 0);
+ k1.mergeWithRFKmer(3, k2);
+ Assert.assertEquals("TCTA", k1.toString());
++
++ test1 = "CTA";
++ test2 = "ATA"; //TAT
++ k1 = new KmerBytesWritable(3);
++ k2 = new KmerBytesWritable(3);
++ k1.setByRead(test1.getBytes(), 0);
++ k2.setByRead(test2.getBytes(), 0);
++ k1.mergeWithFRKmer(3, k2);
++ Assert.assertEquals("CTAT", k1.toString());
++
++ test1 = "ATA";
++ test2 = "CTA"; //TAT
++ k1 = new KmerBytesWritable(3);
++ k2 = new KmerBytesWritable(3);
++ k1.setByRead(test1.getBytes(), 0);
++ k2.setByRead(test2.getBytes(), 0);
++ k1.mergeWithFRKmer(3, k2);
++ Assert.assertEquals("ATAG", k1.toString());
+ }
+
+
+@@ -281,5 +301,55 @@ public class KmerBytesWritableTest {
+ }
+ }
+ }
+-
++
++ @Test
++ public void TestFinalMerge() {
++ String selfString;
++ String match;
++ String msgString;
++ int index;
++ KmerBytesWritable kmer = new KmerBytesWritable();
++ int kmerSize = 3;
++
++ String F1 = "AATAG";
++ String F2 = "TAGAA";
++ String R1 = "CTATT";
++ String R2 = "TTCTA";
++
++ //FF test
++ selfString = F1;
++ match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
++ msgString = F2;
++ index = msgString.indexOf(match);
++ kmer.reset(msgString.length() - index);
++ kmer.setByRead(msgString.substring(index).getBytes(), 0);
++ System.out.println(kmer.toString());
++
++ //FR test
++ selfString = F1;
++ match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
++ msgString = GeneCode.reverseComplement(R2);
++ index = msgString.indexOf(match);
++ kmer.reset(msgString.length() - index);
++ kmer.setByRead(msgString.substring(index).getBytes(), 0);
++ System.out.println(kmer.toString());
++
++ //RF test
++ selfString = R1;
++ match = selfString.substring(0,kmerSize - 1);
++ msgString = GeneCode.reverseComplement(F2);
++ index = msgString.lastIndexOf(match) + kmerSize - 2;
++ kmer.reset(index + 1);
++ kmer.setByReadReverse(msgString.substring(0, index + 1).getBytes(), 0);
++ System.out.println(kmer.toString());
++
++ //RR test
++ selfString = R1;
++ match = selfString.substring(0,kmerSize - 1);
++ msgString = R2;
++ index = msgString.lastIndexOf(match) + kmerSize - 2;
++ kmer.reset(index + 1);
++ kmer.setByRead(msgString.substring(0, index + 1).getBytes(), 0);
++ System.out.println(kmer.toString());
++ }
+ }
+diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java
+index 6d4f683..065bfd5 100644
+--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java
++++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/VertexValueWritable.java
+@@ -32,7 +32,7 @@ public class VertexValueWritable implements WritableComparable<VertexValueWritab
+ public static final byte SHOULD_MERGEWITHNEXT = 0b01 << 3;
+ public static final byte SHOULD_MERGEWITHPREV = 0b10 << 3;
+ public static final byte SHOULD_MERGE_MASK = 0b11 << 3;
+- public static final byte SHOULD_MERGE_CLEAR = 0b1110011;
++ public static final byte SHOULD_MERGE_CLEAR = 0b1100111;
+ }
+
+ private PositionListWritable nodeIdList;
+diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java
+index b7b0814..ec608c5 100644
+--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java
++++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/BasicPathMergeVertex.java
+@@ -495,6 +495,7 @@ public class BasicPathMergeVertex extends
+
+ public void setStateAsMergeWithNext(){
+ byte state = getVertexValue().getState();
++ state &= State.SHOULD_MERGE_CLEAR;
+ state |= State.SHOULD_MERGEWITHNEXT;
+ getVertexValue().setState(state);
+ }
+@@ -512,6 +513,7 @@ public class BasicPathMergeVertex extends
+
+ public void setStateAsMergeWithPrev(){
+ byte state = getVertexValue().getState();
++ state &= State.SHOULD_MERGE_CLEAR;
+ state |= State.SHOULD_MERGEWITHPREV;
+ getVertexValue().setState(state);
+ }
+@@ -638,7 +640,7 @@ public class BasicPathMergeVertex extends
+ String match;
+ String msgString;
+ int index;
+- switch(neighborToMergeDir){
++ switch(neighborToMeDir){
+ case MessageFlag.DIR_FF:
+ selfString = getVertexValue().getKmer().toString();
+ match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
+@@ -648,28 +650,29 @@ public class BasicPathMergeVertex extends
+ kmer.setByRead(msgString.substring(index).getBytes(), 0);
+ break;
+ case MessageFlag.DIR_FR:
+- selfString = getVertexId().toString();
+- match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
++ selfString = getVertexValue().getKmer().toString();
++ match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
+ msgString = GeneCode.reverseComplement(msg.getKmer().toString());
+ index = msgString.indexOf(match);
+ kmer.reset(msgString.length() - index);
+- kmer.setByRead(msgString.substring(index).getBytes(), 0);
++ kmer.setByReadReverse(msgString.substring(index).getBytes(), 0);
+ break;
+ case MessageFlag.DIR_RF:
+- selfString = GeneCode.reverseComplement(getVertexValue().getKmer().toString());
+- match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
+- msgString = msg.getKmer().toString();
+- index = msgString.indexOf(match);
+- kmer.reset(msgString.length() - index);
+- kmer.setByRead(msgString.substring(index).getBytes(), 0);
++ selfString = getVertexValue().getKmer().toString();
++ match = selfString.substring(0,kmerSize - 1);
++ msgString = GeneCode.reverseComplement(msg.getKmer().toString());
++ index = msgString.lastIndexOf(match) + kmerSize - 2;
++ kmer.reset(index + 1);
++ kmer.setByReadReverse(msgString.substring(0, index + 1).getBytes(), 0);
+ break;
+ case MessageFlag.DIR_RR:
+- selfString = GeneCode.reverseComplement(getVertexValue().getKmer().toString());
+- match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
+- msgString = GeneCode.reverseComplement(msg.getKmer().toString());
+- index = msgString.indexOf(match);
+- kmer.reset(msgString.length() - index);
+- kmer.setByRead(msgString.substring(index).getBytes(), 0);
++ selfString = getVertexValue().getKmer().toString();
++ match = selfString.substring(0,kmerSize - 1);
++ msgString = msg.getKmer().toString();
++ index = msgString.lastIndexOf(match) + kmerSize - 2;
++ kmer.reset(index + 1);
++ kmer.setByRead(msgString.substring(0, index + 1).getBytes(), 0);
++ System.out.println(kmer.toString());
+ break;
+ }
+
+diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/LogAlgorithmForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/LogAlgorithmForPathMergeVertex.java
+index a68b646..3b5a782 100644
+--- a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/LogAlgorithmForPathMergeVertex.java
++++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/pathmerge/LogAlgorithmForPathMergeVertex.java
+@@ -170,22 +170,22 @@ public class LogAlgorithmForPathMergeVertex extends
+ case MessageFromHead.BothMsgsFromHead:
+ case MessageFromHead.OneMsgFromOldHeadAndOneFromHead:
+ for(int i = 0; i < 2; i++)
+- processMerge(receivedMsgList.get(i));
++ processFinalMerge(receivedMsgList.get(i)); //processMerge()
+ getVertexValue().setState(State.IS_FINAL);
+ voteToHalt();
+ break;
+ case MessageFromHead.OneMsgFromHeadAndOneFromNonHead:
+ for(int i = 0; i < 2; i++)
+- processMerge(receivedMsgList.get(i));
++ processFinalMerge(receivedMsgList.get(i));
+ getVertexValue().setState(State .IS_HEAD);
+ break;
+ case MessageFromHead.BothMsgsFromNonHead:
+ for(int i = 0; i < 2; i++)
+- processMerge(receivedMsgList.get(i));
++ processFinalMerge(receivedMsgList.get(i));
+ break;
+ case MessageFromHead.NO_MSG:
+ //halt
+- deleteVertex(getVertexId());
++ voteToHalt(); //deleteVertex(getVertexId());
+ break;
+ }
+ }
+diff --git a/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/PathMergeSmallTestSuite.java b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/PathMergeSmallTestSuite.java
+index 9f96b5a..1578dfc 100644
+--- a/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/PathMergeSmallTestSuite.java
++++ b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/PathMergeSmallTestSuite.java
+@@ -52,7 +52,7 @@ public class PathMergeSmallTestSuite extends TestSuite {
+ // + "6", PreFix + File.separator
+ // + "7", PreFix + File.separator
+ // + "8", PreFix + File.separator
+- + "5"};
++ + "9"};
+ private static final String ACTUAL_RESULT_DIR = "data/actual/pathmerge";
+ private static final String PATH_TO_HADOOP_CONF = "src/test/resources/hadoop/conf";
+ private static final String PATH_TO_CLUSTER_STORE = "src/test/resources/cluster/stores.properties";
+--
+1.7.11.1
+