use new genomix-data
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
index c3e53e8..284c2e7 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -25,596 +25,584 @@
import org.apache.hadoop.io.WritableComparator;
import edu.uci.ics.genomix.data.KmerUtil;
+import edu.uci.ics.genomix.data.Marshal;
import edu.uci.ics.genomix.oldtype.NodeWritable.DirectionFlag;
/**
- * Variable kmer length byteswritable
- * It was used to generate the graph in which phase the kmer length doesn't change.
- * Thus the kmerByteSize of bytes doesn't change either.
+ * Variable kmer length byteswritable. It was used to generate the graph, in
+ * which phase the kmer length doesn't change. Thus the kmerByteSize of bytes
+ * doesn't change either.
*/
-public class KmerBytesWritable extends BinaryComparable implements Serializable, WritableComparable<BinaryComparable> {
- /**
+public class KmerBytesWritable extends BinaryComparable implements
+ Serializable, WritableComparable<BinaryComparable> {
+
+ private static final long serialVersionUID = 1L;
+ protected static final byte[] EMPTY_BYTES = {};
+
+ protected int lettersInKmer;
+ protected int bytesUsed;
+ protected byte[] bytes;
+ protected int offset;
+
+ /**
+ * Initialize as an empty kmer: zero letters, referencing the shared
+ * EMPTY_BYTES array at offset 0.
+ */
+ public KmerBytesWritable() {
+ this(0, EMPTY_BYTES, 0);
+ }
+
+ /**
+ * Initialize from a kmer string by delegating to setAsCopy(String); the
+ * kmer length becomes kmer.length().
+ *
+ * @param kmer
+ * : the kmer text
+ */
+ public KmerBytesWritable(String kmer) {
+ setAsCopy(kmer);
+ }
+
+ /**
+ * Set as reference to given data: the array is not copied, so this kmer
+ * reads and writes storage directly (delegates to setAsReference).
+ *
+ * @param k
+ * : number of letters in the kmer
+ * @param storage
+ * : array holding the kmer bytes
+ * @param offset
+ * : index in storage of the first kmer byte
+ */
+ public KmerBytesWritable(int k, byte[] storage, int offset) {
+ setAsReference(k, storage, offset);
+ }
+
+ /**
+  * Creates a kmer of {@code k} letters backed by freshly allocated storage
+  * (the shared empty array when {@code k} is not positive).
+  *
+  * @param k
+  *            number of letters to reserve space for
+  */
+ public KmerBytesWritable(int k) {
+     bytes = (k > 0) ? new byte[KmerUtil.getByteNumFromK(k)] : EMPTY_BYTES;
+     offset = 0;
+     setKmerLength(k);
+ }
+
+ /**
+ * Copy constructor: allocates storage for other's letter count, then
+ * deep-copies other's bytes via setAsCopy(KmerBytesWritable).
+ *
+ * @param other
+ * : the kmer to copy
+ */
+ public KmerBytesWritable(KmerBytesWritable other) {
+ this(other.lettersInKmer);
+ setAsCopy(other);
+ }
+
+ /**
+  * Makes this kmer an independent copy of {@code other}: resizes to other's
+  * letter count, then copies the bytes in use into this kmer's own storage.
+  *
+  * @param other
+  *            the kmer whose contents are copied
+  */
+ public void setAsCopy(KmerBytesWritable other) {
+     reset(other.lettersInKmer);
+     if (lettersInKmer <= 0) {
+         // nothing to copy for an empty kmer
+         return;
+     }
+     System.arraycopy(other.bytes, other.offset, bytes, offset, bytesUsed);
+ }
+
+ /**
+  * Sets this kmer from the letters of a string, encoding them into freshly
+  * allocated packed storage (two bits per letter, same layout as setByRead).
+  * <p>
+  * The previous body stored {@code kmer.getBytes()} directly, i.e. one raw
+  * text byte per letter, while bytesUsed and every reader in this class
+  * (geneCodeAtPosition, the shift and merge methods, toString) treat the
+  * storage as packed gene codes. A kmer built from a string therefore did
+  * not decode to the same letters.
+  *
+  * @param kmer
+  *            the kmer text; its length becomes the kmer length. Presumably
+  *            limited to symbols GeneCode recognises (confirm against GeneCode).
+  */
+ public void setAsCopy(String kmer) {
+     setKmerLength(kmer.length());
+     // Allocate instead of calling reset(): this method is also used by the
+     // String constructor, where bytes has not been assigned yet.
+     bytes = new byte[bytesUsed];
+     offset = 0;
+     setByRead(kmer.getBytes(), 0);
+ }
+
+ /**
+  * Deep copy of the given packed kmer bytes, also setting the kmer length.
+  * <p>
+  * The copy length is the number of bytes that hold {@code k} letters
+  * (bytesUsed after the reset), not {@code k} itself: copying {@code k}
+  * bytes reads past the source kmer and can overrun this kmer's storage.
+  * Nothing is copied when {@code k} is not positive.
+  *
+  * @param k
+  *            number of letters in the source kmer
+  * @param newData
+  *            array holding the packed source bytes
+  * @param offset
+  *            index in {@code newData} of the first source byte
+  */
+ public void setAsCopy(int k, byte[] newData, int offset) {
+     reset(k);
+     if (k > 0) {
+         System.arraycopy(newData, offset, bytes, this.offset, bytesUsed);
+     }
+ }
+
+ /**
+ * Reset this kmer to hold k letters: records the new length, grows the
+ * storage if it is too small (see setSize), and clears the unused high bits
+ * of the leading byte. The remaining bytes are not zeroed here.
+ *
+ * @param k
+ * : the new number of letters
+ */
+ public void reset(int k) {
+ setKmerLength(k);
+ // the setKmerLength in this class has just recomputed bytesUsed for k
+ setSize(bytesUsed);
+ clearLeadBit();
+ }
+
+ /**
+  * Makes this kmer a view over caller-supplied storage. Nothing is copied,
+  * so later writes through this object modify {@code newData}.
+  *
+  * @param k
+  *            number of letters in the kmer
+  * @param newData
+  *            array that holds the kmer bytes
+  * @param offset
+  *            index in {@code newData} of the first kmer byte
+  * @throws IllegalArgumentException
+  *             if fewer bytes than the kmer needs remain after
+  *             {@code offset}; the fields have already been reassigned
+  *             when this is thrown
+  */
+ public void setAsReference(int k, byte[] newData, int offset) {
+     this.bytes = newData;
+     this.offset = offset;
+     // Deliberately not setKmerLength(k): the original author notes that in
+     // inherited classes with a header that call would use the header version.
+     bytesUsed = KmerUtil.getByteNumFromK(k);
+     lettersInKmer = k;
+     final int available = newData.length - offset;
+     if (available < bytesUsed) {
+         throw new IllegalArgumentException("Requested " + bytesUsed
+                 + " bytes (k=" + k + ") but buffer has only " + available
+                 + " bytes");
+     }
+ }
+
+ /**
+ * Ensures that there is space for at least `size` bytes of kmer (not
+ * including any header)
*
*/
- private static final long serialVersionUID = 1L;
- private static final byte[] EMPTY_BYTES = {};
+ protected void setSize(int size) {
+     final int capacity = getCapacity();
+     if (capacity < size) {
+         // grow to 1.5x the request so repeated small growths reallocate less often
+         setCapacity(size * 3 / 2);
+     }
+     bytesUsed = size;
+ }
- public int kmerByteSize;
- protected byte[] bytes;
- protected int offset;
- protected int kmerlength;
+ /**
+ * return the total number of bytes allocated in the backing array, i.e.
+ * bytes.length. Note this is the array's full length, not the number of
+ * bytes in use (bytesUsed), and offset is not subtracted.
+ */
+ protected int getCapacity() {
+ return bytes.length;
+ }
- public KmerBytesWritable() {
- this(0, EMPTY_BYTES, 0);
- }
+ /**
+  * Replaces the backing array with one of exactly {@code new_cap} bytes (no
+  * header included), carrying over the bytes in use and moving them to
+  * offset 0. If the new array is smaller than the bytes in use, bytesUsed is
+  * cut down to fit. Does nothing when the capacity already equals
+  * {@code new_cap}.
+  *
+  * @param new_cap
+  *            the new storage size in bytes
+  */
+ protected void setCapacity(int new_cap) {
+     if (new_cap == getCapacity()) {
+         return;
+     }
+     byte[] resized = new byte[new_cap];
+     bytesUsed = Math.min(bytesUsed, new_cap);
+     if (bytesUsed != 0) {
+         System.arraycopy(bytes, offset, resized, 0, bytesUsed);
+     }
+     bytes = resized;
+     offset = 0;
+ }
- public KmerBytesWritable(int k, byte[] storage, int offset) {
- setNewReference(k, storage, offset);
- }
+ /**
+  * Get one genecode (A|G|C|T) from the given zero-based kmer index, e.g.
+  * index 3 of the kmer ACGTA is T.
+  * <p>
+  * Negative positions are rejected as well as positions past the end: a
+  * negative value would otherwise produce a negative shift or byte index and
+  * return a meaningless code or fail with an array index error.
+  *
+  * @param pos
+  *            zero-based letter position, {@code 0 <= pos < getKmerLength()}
+  * @return the two-bit gene code stored at that position
+  * @throws IllegalArgumentException
+  *             if {@code pos} is negative or not smaller than the kmer length
+  */
+ public byte getGeneCodeAtPosition(int pos) {
+     if (pos < 0 || pos >= lettersInKmer) {
+         throw new IllegalArgumentException("gene position out of bound");
+     }
+     return geneCodeAtPosition(pos);
+ }
- public KmerBytesWritable(int k, String kmer) {
- setNewReference(kmer.length(), kmer.getBytes(), 0);
- }
+ // unchecked version of above. Used when kmerlength is inaccurate
+ // (mid-merge)
+ // Letters are packed four per byte: position 0 sits in the two lowest bits
+ // of the last byte in use, and higher positions move toward bytes[offset].
+ private byte geneCodeAtPosition(int pos) {
+ int posByte = pos / 4;
+ int shift = (pos % 4) << 1;
+ return (byte) ((bytes[offset + bytesUsed - 1 - posByte] >> shift) & 0x3);
+ }
- /**
- * Initial Kmer space by kmerlength
- *
- * @param k
- * kmerlengthz
- */
- public KmerBytesWritable(int k) {
- this.kmerlength = k;
- this.kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- if (k > 0) {
- this.bytes = new byte[this.kmerByteSize];
- } else {
- this.bytes = EMPTY_BYTES;
- }
- this.offset = 0;
- }
+ /**
+ * Record a new letter count and recompute bytesUsed from it. Does not
+ * allocate or resize the storage array.
+ *
+ * @param k
+ * : number of letters in the kmer
+ */
+ public void setKmerLength(int k) {
+ this.bytesUsed = KmerUtil.getByteNumFromK(k);
+ this.lettersInKmer = k;
+ }
- public KmerBytesWritable(KmerBytesWritable right) {
- this(right.kmerlength);
- set(right);
- }
+ /**
+ * @return the number of letters in this kmer
+ */
+ public int getKmerLength() {
+ return lettersInKmer;
+ }
- /**
- * Deep copy of the given kmer
- *
- * @param newData
- */
- public void set(KmerBytesWritable newData) {
- if (newData == null) {
- this.set(0, EMPTY_BYTES, 0);
- } else {
- this.set(newData.kmerlength, newData.bytes, newData.getOffset());
- }
- }
+ /**
+ * @return the backing array itself (not a copy); the kmer data starts at
+ * getOffset() and spans getLength() bytes
+ */
+ @Override
+ public byte[] getBytes() {
+ return bytes;
+ }
- /**
- * Deep copy of the given bytes data
- * It will not change the kmerlength
- *
- * @param newData
- * @param offset
- */
- public void set(byte[] newData, int offset) {
- if (kmerlength > 0) {
- System.arraycopy(newData, offset, bytes, this.offset, kmerByteSize);
- }
- }
+ /**
+ * @return the index in the backing array of the first kmer byte
+ */
+ public int getOffset() {
+ return offset;
+ }
- /**
- * Deep copy of the given data, and also set to new kmerlength
- *
- * @param k
- * : new kmer length
- * @param newData
- * : data storage
- * @param offset
- * : start offset
- */
- public void set(int k, byte[] newData, int offset) {
- reset(k);
- if (k > 0) {
- System.arraycopy(newData, offset, bytes, this.offset, kmerByteSize);
- }
- }
+ /**
+ * @return the number of bytes in use for the kmer (bytesUsed), not the
+ * number of letters
+ */
+ @Override
+ public int getLength() {
+ return bytesUsed;
+ }
- /**
- * Reset array by kmerlength
- *
- * @param k
- */
- public void reset(int k) {
- this.kmerlength = k;
- setSize(KmerUtil.getByteNumFromK(k));
- clearLeadBit();
- }
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ * Packs lettersInKmer letters, two bits each, starting at
+ * stringBytes[start]. The first letter lands in the low bits of the last
+ * byte in use; the byte at offset receives the final, possibly partial,
+ * group. The kmer length is not changed, and the loop stops early if the
+ * input runs out.
+ *
+ * @param stringBytes
+ * : byte array from a _string_. Meaning there's no header
+ * @param start
+ * : index in stringBytes of the first letter to read
+ */
+ public void setByRead(byte[] stringBytes, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = this.bytesUsed - 1;
+ for (int i = start; i < start + lettersInKmer && i < stringBytes.length; i++) {
+ byte code = GeneCode.getCodeFromSymbol(stringBytes[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ // four letters collected: store the full byte and step toward offset
+ if (bytecount == 8) {
+ bytes[offset + bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ // a byte slot is still unwritten: store the remaining partial group in
+ // the leading byte
+ if (bcount >= 0) {
+ bytes[offset] = l;
+ }
+ }
- /**
- * Point this datablock to the given bytes array
- * It works like the pointer to new datablock.
- * kmerlength will not change
- *
- * @param newData
- * @param offset
- */
- public void setNewReference(byte[] newData, int offset) {
- this.bytes = newData;
- this.offset = offset;
- if (newData.length - offset < kmerByteSize) {
- throw new IllegalArgumentException("Not given enough space");
- }
- }
+ /**
+ * Reset this kmer to k letters, then fill it from read text via
+ * setByRead(array, start).
+ *
+ * @param k
+ * : the new number of letters
+ * @param array
+ * : byte array holding the read text
+ * @param start
+ * : index in array of the first letter to read
+ */
+ public void setByRead(int k, byte[] array, int start) {
+ reset(k);
+ setByRead(array, start);
+ }
- /**
- * Point this datablock to the given bytes array
- * It works like the pointer to new datablock.
- * It also set the new kmerlength
- *
- * @param k
- * @param newData
- * @param offset
- */
- public void setNewReference(int k, byte[] newData, int offset) {
- this.kmerlength = k;
- this.kmerByteSize = KmerUtil.getByteNumFromK(k);
- setNewReference(newData, offset);
- }
+ /**
+ * Compress Reversed read into bytes array e.g. AATAG will paired to CTATT,
+ * and then compress as [0x000T,0xTATC]
+ * Walks the lettersInKmer letters that begin at array[start] from last to
+ * first, converting each with GeneCode.getPairedCodeFromSymbol, and packs
+ * them two bits each in that reversed order. The kmer length is not
+ * changed.
+ *
+ * @param array
+ * : byte array holding the read text
+ * @param start
+ * : index in array of the first letter of the read segment
+ */
+ public void setByReadReverse(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = bytesUsed - 1;
+ // for (int i = start + kmerlength - 1; i >= 0 && i < array.length; i--)
+ // {
+ for (int i = start + lettersInKmer - 1; i >= start && i < array.length; i--) {
+ byte code = GeneCode.getPairedCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ // four letters collected: store the full byte and step toward offset
+ if (bytecount == 8) {
+ bytes[offset + bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ // a byte slot is still unwritten: store the remaining partial group in
+ // the leading byte
+ if (bcount >= 0) {
+ bytes[offset] = l;
+ }
+ }
- protected void setSize(int size) {
- if (size > getCapacity()) {
- setCapacity((size * 3 / 2));
- }
- this.kmerByteSize = size;
- }
+ /**
+ * Reset this kmer to k letters, then fill it via
+ * setByReadReverse(array, start).
+ *
+ * @param k
+ * : the new number of letters
+ * @param array
+ * : byte array holding the read text
+ * @param start
+ * : index in array of the first letter of the read segment
+ */
+ public void setByReadReverse(int k, byte[] array, int start) {
+ reset(k);
+ setByReadReverse(array, start);
+ }
- protected int getCapacity() {
- return bytes.length;
- }
+ /**
+  * Converts a gene character to its code and shifts it into the kmer through
+  * {@link #shiftKmerWithNextCode(byte)}.
+  *
+  * @param c
+  *            the new gene character
+  * @return the gene that was shifted out, in gene code format
+  */
+ public byte shiftKmerWithNextChar(byte c) {
+     final byte code = GeneCode.getCodeFromSymbol(c);
+     return shiftKmerWithNextCode(code);
+ }
- protected void setCapacity(int new_cap) {
- if (new_cap != getCapacity()) {
- byte[] new_data = new byte[new_cap];
- if (new_cap < kmerByteSize) {
- kmerByteSize = new_cap;
- }
- if (kmerByteSize != 0) {
- System.arraycopy(bytes, offset, new_data, 0, kmerByteSize);
- }
- bytes = new_data;
- offset = 0;
- }
- }
+ /**
+ * Shift Kmer to accept new gene code
+ * The letter at position 0 (the two lowest bits of the last byte in use) is
+ * dropped, every remaining letter moves down one position, and c is stored
+ * at position lettersInKmer - 1. The kmer length is unchanged.
+ *
+ * @param c
+ * Input new gene code
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextCode(byte c) {
+ // remember the letter at position 0 before it is shifted away
+ byte output = (byte) (bytes[offset + bytesUsed - 1] & 0x03);
+ // from the last byte toward the front: shift right by one letter and
+ // pull the lowest letter of the preceding byte into the top two bits
+ for (int i = bytesUsed - 1; i > 0; i--) {
+ byte in = (byte) (bytes[offset + i - 1] & 0x03);
+ bytes[offset + i] = (byte) (((bytes[offset + i] >>> 2) & 0x3f) | (in << 6));
+ }
+ // bit position of the highest letter inside the leading byte
+ int pos = ((lettersInKmer - 1) % 4) << 1;
+ byte code = (byte) (c << pos);
+ bytes[offset] = (byte) (((bytes[offset] >>> 2) & 0x3f) | code);
+ clearLeadBit();
+ return output;
+ }
- /**
- * Get one genecode (A|G|C|T) from the given kmer index
- * e.g. Get the 4th gene of the kmer ACGTA will return T
- *
- * @param pos
- * @return
- */
- public byte getGeneCodeAtPosition(int pos) {
- if (pos >= kmerlength) {
- throw new IllegalArgumentException("gene position out of bound");
- }
- return geneCodeAtPosition(pos);
- }
-
- // unchecked version of above. Used when kmerlength is inaccurate (mid-merge)
- private byte geneCodeAtPosition(int pos) {
- int posByte = pos / 4;
- int shift = (pos % 4) << 1;
- return (byte) ((bytes[offset + kmerByteSize - 1 - posByte] >> shift) & 0x3);
- }
-
- public int getKmerLength() {
- return this.kmerlength;
- }
+ /**
+  * Converts a gene character to its code and shifts it into the kmer through
+  * {@link #shiftKmerWithPreCode(byte)}.
+  *
+  * @param c
+  *            the new gene character
+  * @return the gene that was shifted out, in gene code format
+  */
+ public byte shiftKmerWithPreChar(byte c) {
+     final byte code = GeneCode.getCodeFromSymbol(c);
+     return shiftKmerWithPreCode(code);
+ }
- @Override
- public byte[] getBytes() {
- return bytes;
- }
+ /**
+ * Shift Kmer to accept new gene code
+ * The letter at position lettersInKmer - 1 (held in the leading byte) is
+ * dropped, every remaining letter moves up one position, and c is stored
+ * at position 0 (the two lowest bits of the last byte in use). The kmer
+ * length is unchanged.
+ *
+ * @param c
+ * Input new gene code
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreCode(byte c) {
+ // bit position of the highest letter inside the leading byte
+ int pos = ((lettersInKmer - 1) % 4) << 1;
+ byte output = (byte) ((bytes[offset] >> pos) & 0x03);
+ // from the front toward the last byte: shift left by one letter and
+ // pull the top letter of the following byte into the low two bits
+ for (int i = 0; i < bytesUsed - 1; i++) {
+ byte in = (byte) ((bytes[offset + i + 1] >> 6) & 0x03);
+ bytes[offset + i] = (byte) ((bytes[offset + i] << 2) | in);
+ }
+ bytes[offset + bytesUsed - 1] = (byte) ((bytes[offset + bytesUsed - 1] << 2) | c);
+ // the left shift pushed the dropped letter above the kmer; mask it off
+ clearLeadBit();
+ return output;
+ }
- public int getOffset() {
- return offset;
- }
+ /**
+ * Merge Kmer with the next connected Kmer e.g. AAGCTAA merge with AACAACC,
+ * if the initial kmerSize = 3 then it will return AAGCTAACAACC
+ * This kmer is modified in place and grows by
+ * kmer.lettersInKmer - initialKmerSize + 1 letters; kmer is only read.
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param kmer
+ * : the next kmer
+ */
+ public void mergeWithFFKmer(int initialKmerSize, KmerBytesWritable kmer) {
+ int preKmerLength = lettersInKmer;
+ int preSize = bytesUsed;
+ lettersInKmer += kmer.lettersInKmer - initialKmerSize + 1;
+ setSize(KmerUtil.getByteNumFromK(lettersInKmer));
+ // move the existing bytes to the tail of the enlarged region, since
+ // position 0 is stored in the last byte in use
+ for (int i = 1; i <= preSize; i++) {
+ bytes[offset + bytesUsed - i] = bytes[offset + preSize - i];
+ }
+ // append kmer's letters from position initialKmerSize - 1 onward, four
+ // letters (one byte) per step, directly after the existing letters
+ for (int k = initialKmerSize - 1; k < kmer.getKmerLength(); k += 4) {
+ byte onebyte = getOneByteFromKmerAtPosition(k, kmer.getBytes(),
+ kmer.getOffset(), kmer.getLength());
+ appendOneByteAtPosition(preKmerLength + k - initialKmerSize + 1,
+ onebyte, bytes, offset, bytesUsed);
+ }
+ clearLeadBit();
+ }
- @Override
- public int getLength() {
- return kmerByteSize;
- }
+ /**
+ * Merge Kmer with the next connected Kmer, when that Kmer needs to be
+ * reverse-complemented e.g. AAGCTAA merge with GGTTGTT, if the initial
+ * kmerSize = 3 then it will return AAGCTAACAACC A merge B => A B~
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param kmer
+ * : the next kmer
+ */
+ public void mergeWithFRKmer(int initialKmerSize, KmerBytesWritable kmer) {
+ int preSize = bytesUsed;
+ int preKmerLength = lettersInKmer;
+ lettersInKmer += kmer.lettersInKmer - initialKmerSize + 1;
+ setSize(KmerUtil.getByteNumFromK(lettersInKmer));
+ // copy prefix into right-side of buffer
+ for (int i = 1; i <= preSize; i++) {
+ bytes[offset + bytesUsed - i] = bytes[offset + preSize - i];
+ }
- /**
- * Read Kmer from read text into bytes array e.g. AATAG will compress as
- * [0x000G, 0xATAA]
- *
- * @param k
- * @param array
- * @param start
- */
- public void setByRead(byte[] array, int start) {
- byte l = 0;
- int bytecount = 0;
- int bcount = this.kmerByteSize - 1;
- for (int i = start; i < start + kmerlength && i < array.length; i++) {
- byte code = GeneCode.getCodeFromSymbol(array[i]);
- l |= (byte) (code << bytecount);
- bytecount += 2;
- if (bytecount == 8) {
- bytes[offset + bcount--] = l;
- l = 0;
- bytecount = 0;
- }
- }
- if (bcount >= 0) {
- bytes[offset] = l;
- }
- }
+ int bytecount = (preKmerLength % 4) * 2;
+ int bcount = bytesUsed - preSize - bytecount / 8; // may overlap
+ // previous kmer
+ byte l = bcount == bytesUsed - preSize ? bytes[offset + bcount] : 0x00;
+ bytecount %= 8;
+ for (int i = kmer.lettersInKmer - initialKmerSize; i >= 0; i--) {
+ byte code = GeneCode.getPairedGeneCode(kmer
+ .getGeneCodeAtPosition(i));
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[offset + bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[offset] = l;
+ }
+ }
- public void setByRead(int k, byte[] array, int start) {
- reset(k);
- setByRead(array, start);
- }
+ /**
+ * Merge Kmer with the previous connected Kmer, when that kmer needs to be
+ * reverse-complemented e.g. AACAACC merge with TTCTGCC, if the initial
+ * kmerSize = 3 then it will return GGCAGAACAACC
+ * Implemented by building a temporary reverse-complemented copy of preKmer
+ * and handing it to mergeWithRRKmer; preKmer itself is not modified.
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param preKmer
+ * : the previous kmer
+ */
+ public void mergeWithRFKmer(int initialKmerSize, KmerBytesWritable preKmer) {
+ KmerBytesWritable reversed = new KmerBytesWritable(preKmer.lettersInKmer);
+ // round-trip through the text form so setByReadReverse can produce the
+ // reverse complement
+ reversed.setByReadReverse(preKmer.toString().getBytes(), 0);
+ mergeWithRRKmer(initialKmerSize, reversed);
+ }
- /**
- * Compress Reversed read into bytes array
- * e.g. AATAG will paired to CTATT, and then compress as
- * [0x000T,0xTATC]
- *
- * @param input
- * array
- * @param start
- * position
- */
- public void setByReadReverse(byte[] array, int start) {
- byte l = 0;
- int bytecount = 0;
- int bcount = kmerByteSize - 1;
-// for (int i = start + kmerlength - 1; i >= 0 && i < array.length; i--) {
- for (int i = start + kmerlength - 1; i >= start && i < array.length; i--) {
- byte code = GeneCode.getPairedCodeFromSymbol(array[i]);
- l |= (byte) (code << bytecount);
- bytecount += 2;
- if (bytecount == 8) {
- bytes[offset + bcount--] = l;
- l = 0;
- bytecount = 0;
- }
- }
- if (bcount >= 0) {
- bytes[offset] = l;
- }
- }
+ /**
+ * Merge Kmer with the previous connected Kmer e.g. AACAACC merge with
+ * AAGCTAA, if the initial kmerSize = 3 then it will return AAGCTAACAACC
+ *
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param preKmer
+ * : the previous kmer
+ */
+ public void mergeWithRRKmer(int initialKmerSize, KmerBytesWritable preKmer) {
+ int preKmerLength = lettersInKmer;
+ int preSize = bytesUsed;
+ lettersInKmer += preKmer.lettersInKmer - initialKmerSize + 1;
+ setSize(KmerUtil.getByteNumFromK(lettersInKmer));
+ byte cacheByte = getOneByteFromKmerAtPosition(0, bytes, offset, preSize);
- public void setByReadReverse(int k, byte[] array, int start) {
- reset(k);
- setByReadReverse(array, start);
- }
+ // copy prekmer
+ for (int k = 0; k < preKmer.lettersInKmer - initialKmerSize + 1; k += 4) {
+ byte onebyte = getOneByteFromKmerAtPosition(k, preKmer.bytes,
+ preKmer.offset, preKmer.bytesUsed);
+ appendOneByteAtPosition(k, onebyte, bytes, offset, bytesUsed);
+ }
- /**
- * Shift Kmer to accept new char input
- *
- * @param c
- * Input new gene character
- * @return the shift out gene, in gene code format
- */
- public byte shiftKmerWithNextChar(byte c) {
- return shiftKmerWithNextCode(GeneCode.getCodeFromSymbol(c));
- }
+ // copy current kmer
+ int k = 4;
+ for (; k < preKmerLength; k += 4) {
+ byte onebyte = getOneByteFromKmerAtPosition(k, bytes, offset,
+ preSize);
+ appendOneByteAtPosition(preKmer.lettersInKmer - initialKmerSize + k
+ - 4 + 1, cacheByte, bytes, offset, bytesUsed);
+ cacheByte = onebyte;
+ }
+ appendOneByteAtPosition(preKmer.lettersInKmer - initialKmerSize + k - 4
+ + 1, cacheByte, bytes, offset, bytesUsed);
+ clearLeadBit();
+ }
- /**
- * Shift Kmer to accept new gene code
- *
- * @param c
- * Input new gene code
- * @return the shift out gene, in gene code format
- */
- public byte shiftKmerWithNextCode(byte c) {
- byte output = (byte) (bytes[offset + kmerByteSize - 1] & 0x03);
- for (int i = kmerByteSize - 1; i > 0; i--) {
- byte in = (byte) (bytes[offset + i - 1] & 0x03);
- bytes[offset + i] = (byte) (((bytes[offset + i] >>> 2) & 0x3f) | (in << 6));
- }
- int pos = ((kmerlength - 1) % 4) << 1;
- byte code = (byte) (c << pos);
- bytes[offset] = (byte) (((bytes[offset] >>> 2) & 0x3f) | code);
- clearLeadBit();
- return output;
- }
+ /**
+ * Merge this kmer with the given kmer using the merge routine selected by
+ * the direction bits of dir (dir masked with DirectionFlag.DIR_MASK).
+ *
+ * @param dir
+ * : direction flag; only the bits under DIR_MASK are examined
+ * @param initialKmerSize
+ * : the initial kmerSize
+ * @param kmer
+ * : the kmer to merge with
+ * @throws RuntimeException
+ * if the masked direction is none of FF, FR, RF, RR
+ */
+ public void mergeWithKmerInDir(byte dir, int initialKmerSize,
+ KmerBytesWritable kmer) {
+ switch (dir & DirectionFlag.DIR_MASK) {
+ case DirectionFlag.DIR_FF:
+ mergeWithFFKmer(initialKmerSize, kmer);
+ break;
+ case DirectionFlag.DIR_FR:
+ mergeWithFRKmer(initialKmerSize, kmer);
+ break;
+ case DirectionFlag.DIR_RF:
+ mergeWithRFKmer(initialKmerSize, kmer);
+ break;
+ case DirectionFlag.DIR_RR:
+ mergeWithRRKmer(initialKmerSize, kmer);
+ break;
+ default:
+ throw new RuntimeException("Direction not recognized: " + dir);
+ }
+ }
- /**
- * Shift Kmer to accept new input char
- *
- * @param c
- * Input new gene character
- * @return the shiftout gene, in gene code format
- */
- public byte shiftKmerWithPreChar(byte c) {
- return shiftKmerWithPreCode(GeneCode.getCodeFromSymbol(c));
- }
+ public static void appendOneByteAtPosition(int k, byte onebyte,
+ byte[] buffer, int start, int length) {
+ int position = start + length - 1 - k / 4;
+ if (position < start) {
+ throw new IllegalArgumentException(
+ "Buffer for kmer storage is invalid");
+ }
+ int shift = ((k) % 4) << 1;
+ int mask = shift == 0 ? 0 : ((1 << shift) - 1);
- /**
- * Shift Kmer to accept new gene code
- *
- * @param c
- * Input new gene code
- * @return the shiftout gene, in gene code format
- */
- public byte shiftKmerWithPreCode(byte c) {
- int pos = ((kmerlength - 1) % 4) << 1;
- byte output = (byte) ((bytes[offset] >> pos) & 0x03);
- for (int i = 0; i < kmerByteSize - 1; i++) {
- byte in = (byte) ((bytes[offset + i + 1] >> 6) & 0x03);
- bytes[offset + i] = (byte) ((bytes[offset + i] << 2) | in);
- }
- bytes[offset + kmerByteSize - 1] = (byte) ((bytes[offset + kmerByteSize - 1] << 2) | c);
- clearLeadBit();
- return output;
- }
+ buffer[position] = (byte) ((buffer[position] & mask) | ((0xff & onebyte) << shift));
+ if (position > start && shift != 0) {
+ buffer[position - 1] = (byte) ((buffer[position - 1] & (0xff - mask)) | ((byte) ((0xff & onebyte) >>> (8 - shift))));
+ }
+ }
- /**
- * Merge Kmer with the next connected Kmer
- * e.g. AAGCTAA merge with AACAACC, if the initial kmerSize = 3
- * then it will return AAGCTAACAACC
- *
- * @param initialKmerSize
- * : the initial kmerSize
- * @param kmer
- * : the next kmer
- */
- public void mergeWithFFKmer(int initialKmerSize, KmerBytesWritable kmer) {
- int preKmerLength = kmerlength;
- int preSize = kmerByteSize;
- this.kmerlength += kmer.kmerlength - initialKmerSize + 1;
- setSize(KmerUtil.getByteNumFromK(kmerlength));
- for (int i = 1; i <= preSize; i++) {
- bytes[offset + kmerByteSize - i] = bytes[offset + preSize - i];
- }
- for (int k = initialKmerSize - 1; k < kmer.getKmerLength(); k += 4) {
- byte onebyte = getOneByteFromKmerAtPosition(k, kmer.getBytes(), kmer.getOffset(), kmer.getLength());
- appendOneByteAtPosition(preKmerLength + k - initialKmerSize + 1, onebyte, bytes, offset, kmerByteSize);
- }
- clearLeadBit();
- }
+ /**
+ * Read the eight bits (four letters) that begin at letter position k of a
+ * packed kmer. When k is not a multiple of four the result is assembled
+ * from two neighbouring bytes; bits that would come from before start are
+ * left as zero.
+ *
+ * @param k
+ * : letter position of the lowest letter to read
+ * @param buffer
+ * : array holding the packed kmer
+ * @param start
+ * : index in buffer of the first kmer byte
+ * @param length
+ * : number of kmer bytes in buffer
+ * @return the byte whose low two bits are the letter at position k
+ * @throws IllegalArgumentException
+ * if position k lies beyond the bytes described by start and
+ * length
+ */
+ public static byte getOneByteFromKmerAtPosition(int k, byte[] buffer,
+ int start, int length) {
+ // position 0 lives in the last byte, so higher positions index backwards
+ int position = start + length - 1 - k / 4;
+ if (position < start) {
+ throw new IllegalArgumentException(
+ "Buffer of kmer storage is invalid");
+ }
+ int shift = (k % 4) << 1;
+ byte data = (byte) (((0xff) & buffer[position]) >>> shift);
+ // unaligned read: take the missing high bits from the preceding byte
+ if (shift != 0 && position > start) {
+ data |= 0xff & (buffer[position - 1] << (8 - shift));
+ }
+ return data;
+ }
- /**
- * Merge Kmer with the next connected Kmer, when that Kmer needs to be reverse-complemented
- * e.g. AAGCTAA merge with GGTTGTT, if the initial kmerSize = 3
- * then it will return AAGCTAACAACC
- *
- * A merge B => A B~
- *
- * @param initialKmerSize
- * : the initial kmerSize
- * @param kmer
- * : the next kmer
- */
- public void mergeWithFRKmer(int initialKmerSize, KmerBytesWritable kmer) {
- int preSize = kmerByteSize;
- int preKmerLength = kmerlength;
- this.kmerlength += kmer.kmerlength - initialKmerSize + 1;
- setSize(KmerUtil.getByteNumFromK(kmerlength));
- // copy prefix into right-side of buffer
- for (int i = 1; i <= preSize; i++) {
- bytes[offset + kmerByteSize - i] = bytes[offset + preSize - i];
- }
+ /**
+  * Zeroes the bits of the leading byte that lie above the kmer's highest
+  * letter. Nothing is done when the letter count is a multiple of four,
+  * because the leading byte is then fully used.
+  */
+ protected void clearLeadBit() {
+     final int lettersInLeadByte = lettersInKmer % 4;
+     if (lettersInLeadByte == 0) {
+         return;
+     }
+     bytes[offset] &= (1 << (lettersInLeadByte << 1)) - 1;
+ }
- int bytecount = (preKmerLength % 4) * 2;
- int bcount = kmerByteSize - preSize - bytecount / 8; // may overlap previous kmer
- byte l = bcount == kmerByteSize - preSize ? bytes[offset + bcount] : 0x00;
- bytecount %= 8;
- for (int i = kmer.kmerlength - initialKmerSize; i >= 0; i--) {
- byte code = GeneCode.getPairedGeneCode(kmer.getGeneCodeAtPosition(i));
- l |= (byte) (code << bytecount);
- bytecount += 2;
- if (bytecount == 8) {
- bytes[offset + bcount--] = l;
- l = 0;
- bytecount = 0;
- }
- }
- if (bcount >= 0) {
- bytes[offset] = l;
- }
- }
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ lettersInKmer = in.readInt();
+ bytesUsed = KmerUtil.getByteNumFromK(lettersInKmer);
+ if (lettersInKmer > 0) {
+ if (this.bytes.length < this.bytesUsed) {
+ this.bytes = new byte[this.bytesUsed];
+ this.offset = 0;
- /**
- * Merge Kmer with the previous connected Kmer, when that kmer needs to be reverse-complemented
- * e.g. AACAACC merge with TTCTGCC, if the initial kmerSize = 3
- * then it will return GGCAGAACAACC
- *
- * @param initialKmerSize
- * : the initial kmerSize
- * @param preKmer
- * : the previous kmer
- */
- public void mergeWithRFKmer(int initialKmerSize, KmerBytesWritable preKmer) {
- int preKmerLength = kmerlength;
- int preSize = kmerByteSize;
- this.kmerlength += preKmer.kmerlength - initialKmerSize + 1;
- setSize(KmerUtil.getByteNumFromK(kmerlength));
- // byte cacheByte = getOneByteFromKmerAtPosition(0, bytes, offset, preSize);
+ }
+ in.readFully(bytes, offset, bytesUsed);
+ }
+ }
- int byteIndex = kmerByteSize - 1;
- byte cacheByte = 0x00;
- int posnInByte = 0;
+ /**
+  * Serializes the kmer as its letter count (an int) followed by the bytes in
+  * use; an empty kmer writes the count only.
+  *
+  * @param out
+  *            destination of the serialized form
+  * @throws IOException
+  *             if writing to {@code out} fails
+  */
+ @Override
+ public void write(DataOutput out) throws IOException {
+     out.writeInt(lettersInKmer);
+     if (lettersInKmer <= 0) {
+         return;
+     }
+     out.write(bytes, offset, bytesUsed);
+ }
- // copy rc of preKmer into high bytes
- for (int i = preKmer.kmerlength - 1; i >= initialKmerSize - 1; i--) {
- byte code = GeneCode.getPairedGeneCode(preKmer.getGeneCodeAtPosition(i));
- cacheByte |= (byte) (code << posnInByte);
- posnInByte += 2;
- if (posnInByte == 8) {
- bytes[byteIndex--] = cacheByte;
- cacheByte = 0;
- posnInByte = 0;
- }
- }
-
- // copy my kmer into low positions of bytes
- for (int i = 0; i < preKmerLength; i++) {
- // expanding the capacity makes this offset incorrect. It's off by the # of additional bytes added.
- int newposn = i + (kmerByteSize - preSize) * 4;
- byte code = geneCodeAtPosition(newposn);
- cacheByte |= (byte) (code << posnInByte);
- posnInByte += 2;
- if (posnInByte == 8) {
- bytes[byteIndex--] = cacheByte;
- cacheByte = 0;
- posnInByte = 0;
- }
- }
- if(posnInByte > 0)
- bytes[offset] = cacheByte;
- clearLeadBit();
- }
+ /**
+ * Combines the superclass hash with the letter count, matching equals(),
+ * which also compares lettersInKmer.
+ */
+ @Override
+ public int hashCode() {
+ return super.hashCode() * 31 + this.lettersInKmer;
+ }
- /**
- * Merge Kmer with the previous connected Kmer
- * e.g. AACAACC merge with AAGCTAA, if the initial kmerSize = 3
- * then it will return AAGCTAACAACC
- *
- * @param initialKmerSize
- * : the initial kmerSize
- * @param preKmer
- * : the previous kmer
- */
- public void mergeWithRRKmer(int initialKmerSize, KmerBytesWritable preKmer) {
- int preKmerLength = kmerlength;
- int preSize = kmerByteSize;
- this.kmerlength += preKmer.kmerlength - initialKmerSize + 1;
- setSize(KmerUtil.getByteNumFromK(kmerlength));
- byte cacheByte = getOneByteFromKmerAtPosition(0, bytes, offset, preSize);
+ /**
+  * Two kmers are equal when the other object is a KmerBytesWritable with the
+  * same letter count and the superclass comparison also reports equality.
+  *
+  * @param right_obj
+  *            the object to compare with
+  * @return {@code true} if both conditions hold, {@code false} otherwise
+  */
+ @Override
+ public boolean equals(Object right_obj) {
+     if (!(right_obj instanceof KmerBytesWritable)) {
+         return false;
+     }
+     final KmerBytesWritable that = (KmerBytesWritable) right_obj;
+     return this.lettersInKmer == that.lettersInKmer && super.equals(right_obj);
+ }
- // copy prekmer
- for (int k = 0; k < preKmer.kmerlength - initialKmerSize + 1; k += 4) {
- byte onebyte = getOneByteFromKmerAtPosition(k, preKmer.bytes, preKmer.offset, preKmer.kmerByteSize);
- appendOneByteAtPosition(k, onebyte, bytes, offset, kmerByteSize);
- }
+ /**
+ * Returns the string produced by KmerUtil.recoverKmerFrom for this kmer's
+ * letter count, backing array, offset and byte length (presumably the
+ * kmer's letters as text; confirm against KmerUtil).
+ */
+ @Override
+ public String toString() {
+ return KmerUtil.recoverKmerFrom(this.lettersInKmer, this.getBytes(),
+ offset, this.getLength());
+ }
- // copy current kmer
- int k = 4;
- for (; k < preKmerLength; k += 4) {
- byte onebyte = getOneByteFromKmerAtPosition(k, bytes, offset, preSize);
- appendOneByteAtPosition(preKmer.kmerlength - initialKmerSize + k - 4 + 1, cacheByte, bytes, offset, kmerByteSize);
- cacheByte = onebyte;
- }
- appendOneByteAtPosition(preKmer.kmerlength - initialKmerSize + k - 4 + 1, cacheByte, bytes, offset, kmerByteSize);
- clearLeadBit();
- }
-
- public void mergeWithKmerInDir(byte dir, int initialKmerSize, KmerBytesWritable kmer) {
- switch(dir & DirectionFlag.DIR_MASK) {
- case DirectionFlag.DIR_FF:
- mergeWithFFKmer(initialKmerSize, kmer);
- break;
- case DirectionFlag.DIR_FR:
- mergeWithFRKmer(initialKmerSize, kmer);
- break;
- case DirectionFlag.DIR_RF:
- mergeWithRFKmer(initialKmerSize, kmer);
- break;
- case DirectionFlag.DIR_RR:
- mergeWithRRKmer(initialKmerSize, kmer);
- break;
- default:
- throw new RuntimeException("Direciotn not recognized: " + dir);
- }
- }
+ public static class Comparator extends WritableComparator {
+ private static final int LEADING_BYTES = 4;
- public static void appendOneByteAtPosition(int k, byte onebyte, byte[] buffer, int start, int length) {
- int position = start + length - 1 - k / 4;
- if (position < start) {
- throw new IllegalArgumentException("Buffer for kmer storage is invalid");
- }
- int shift = ((k) % 4) << 1;
- int mask = shift == 0 ? 0 : ((1 << shift) - 1);
+ public Comparator() {
+ super(KmerBytesWritable.class);
+ }
- buffer[position] = (byte) ((buffer[position] & mask) | ((0xff & onebyte) << shift));
- if (position > start && shift != 0) {
- buffer[position - 1] = (byte) ((buffer[position - 1] & (0xff - mask)) | ((byte) ((0xff & onebyte) >>> (8 - shift))));
- }
- }
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ int kmerlength1 = Marshal.getInt(b1, s1);
+ int kmerlength2 = Marshal.getInt(b2, s2);
+ if (kmerlength1 == kmerlength2) {
+ return compareBytes(b1, s1 + LEADING_BYTES, l1 - LEADING_BYTES,
+ b2, s2 + LEADING_BYTES, l2 - LEADING_BYTES);
+ }
+ return kmerlength1 - kmerlength2;
+ }
+ }
- public static byte getOneByteFromKmerAtPosition(int k, byte[] buffer, int start, int length) {
- int position = start + length - 1 - k / 4;
- if (position < start) {
- throw new IllegalArgumentException("Buffer of kmer storage is invalid");
- }
- int shift = (k % 4) << 1;
- byte data = (byte) (((0xff) & buffer[position]) >>> shift);
- if (shift != 0 && position > start) {
- data |= 0xff & (buffer[position - 1] << (8 - shift));
- }
- return data;
- }
-
- protected void clearLeadBit() {
- if (kmerlength % 4 != 0) {
- bytes[offset] &= (1 << ((kmerlength % 4) << 1)) - 1;
- }
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- this.kmerlength = in.readInt();
- this.kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- if (this.kmerlength > 0) {
- if (this.bytes.length < this.kmerByteSize) {
- this.bytes = new byte[this.kmerByteSize];
- this.offset = 0;
- }
- in.readFully(bytes, offset, kmerByteSize);
- }
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeInt(kmerlength);
- if (kmerlength > 0) {
- out.write(bytes, offset, kmerByteSize);
- }
- }
-
- @Override
- public int hashCode() {
- return super.hashCode() * 31 + this.kmerlength;
- }
-
- @Override
- public boolean equals(Object right_obj) {
- if (right_obj instanceof KmerBytesWritable)
- return this.kmerlength == ((KmerBytesWritable) right_obj).kmerlength && super.equals(right_obj);
- return false;
- }
-
- @Override
- public String toString() {
- return KmerUtil.recoverKmerFrom(this.kmerlength, this.getBytes(), offset, this.getLength());
- }
-
- public static class Comparator extends WritableComparator {
- public final int LEAD_BYTES = 4;
-
- public Comparator() {
- super(KmerBytesWritable.class);
- }
-
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- int kmerlength1 = readInt(b1, s1);
- int kmerlength2 = readInt(b2, s2);
- if (kmerlength1 == kmerlength2) {
- return compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES);
- }
- return kmerlength1 - kmerlength2;
- }
- }
-
- static { // register this comparator
- WritableComparator.define(KmerBytesWritable.class, new Comparator());
- }
+ // Runs once when the class is loaded: makes the nested Comparator the
+ // registered WritableComparator for KmerBytesWritable.
+ static { // register this comparator
+ WritableComparator.define(KmerBytesWritable.class, new Comparator());
+ }
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java
index d2e3a94..f805610 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritableFactory.java
@@ -64,7 +64,7 @@
return null;
}
if (lastK == kmerChain.getKmerLength()) {
- kmer.set(kmerChain);
+ kmer.setAsCopy(kmerChain);
return kmer;
}
kmer.reset(lastK);
@@ -100,7 +100,7 @@
return null;
}
if (firstK == kmerChain.getKmerLength()) {
- kmer.set(kmerChain);
+ kmer.setAsCopy(kmerChain);
return kmer;
}
kmer.reset(firstK);
@@ -124,7 +124,7 @@
return null;
}
if (startK == 0 && kSize == kmerChain.getKmerLength()) {
- kmer.set(kmerChain);
+ kmer.setAsCopy(kmerChain);
return kmer;
}
kmer.reset(kSize);
@@ -256,7 +256,7 @@
* @return new created kmer that shifted by afterCode, the K will not change
*/
public KmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode) {
- this.kmer.set(kmer);
+ this.kmer.setAsCopy(kmer);
this.kmer.shiftKmerWithNextCode(afterCode);
return this.kmer;
}
@@ -274,7 +274,7 @@
* @return new created kmer that shifted by preCode, the K will not change
*/
public KmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
- this.kmer.set(kmer);
+ this.kmer.setAsCopy(kmer);
this.kmer.shiftKmerWithPreCode(preCode);
return this.kmer;
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerListWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerListWritable.java
index 88bb79c..6bf8dac 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerListWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerListWritable.java
@@ -4,6 +4,7 @@
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@@ -12,224 +13,256 @@
import edu.uci.ics.genomix.data.KmerUtil;
import edu.uci.ics.genomix.data.Marshal;
-public class KmerListWritable implements Writable, Iterable<KmerBytesWritable>, Serializable{
- private static final long serialVersionUID = 1L;
- protected byte[] storage;
- protected int offset;
- protected int valueCount;
- public int kmerByteSize = 0;
- public int kmerlength = 0;
- protected static final byte[] EMPTY = {};
-
- protected KmerBytesWritable posIter = new KmerBytesWritable();
-
- public KmerListWritable() {
- this.storage = EMPTY;
- this.valueCount = 0;
- this.offset = 0;
- }
-
- public KmerListWritable(int kmerlength) {
- this();
- this.kmerlength = kmerlength;
- this.kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- }
-
- public KmerListWritable(int kmerlength, int count, byte[] data, int offset) {
- this.kmerlength = kmerlength;
- this.kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- setNewReference(count, data, offset);
- }
-
- public KmerListWritable(List<KmerBytesWritable> kmers) {
- this();
- setSize(kmers.size()); // reserve space for all elements
- for (KmerBytesWritable kmer : kmers) {
- append(kmer);
- }
- }
-
- public void setNewReference(int count, byte[] data, int offset) {
- this.valueCount = count;
- this.storage = data;
- this.offset = offset;
- }
-
- public void append(KmerBytesWritable kmer){
- if(kmer != null){
- kmerByteSize = kmer.kmerByteSize;
- kmerlength = kmer.kmerlength;
- setSize((1 + valueCount) * kmerByteSize);
- System.arraycopy(kmer.getBytes(), 0, storage, offset + valueCount * kmerByteSize, kmerByteSize);
- valueCount += 1;
- }
- }
-
- /*
- * Append the otherList to the end of myList
- */
- public void appendList(KmerListWritable otherList) {
- if (otherList.valueCount > 0) {
- setSize((valueCount + otherList.valueCount) * kmerByteSize);
- // copy contents of otherList into the end of my storage
- System.arraycopy(otherList.storage, otherList.offset,
- storage, offset + valueCount * kmerByteSize,
- otherList.valueCount * kmerByteSize);
- valueCount += otherList.valueCount;
- }
- }
-
- protected void setSize(int size) {
- if (size > getCapacity()) {
- setCapacity((size * 3 / 2));
- }
- }
-
- protected int getCapacity() {
- return storage.length - offset;
- }
- protected void setCapacity(int new_cap) {
- if (new_cap > getCapacity()) {
- byte[] new_data = new byte[new_cap];
- if (storage.length - offset > 0) {
- System.arraycopy(storage, offset, new_data, 0, storage.length - offset);
- }
- storage = new_data;
- offset = 0;
- }
- }
-
- public void reset() {
- this.reset(0);
- }
-
- public void reset(int kmerSize) {
- kmerlength = kmerSize;
- kmerByteSize = KmerUtil.getByteNumFromK(kmerlength);
- storage = EMPTY;
- valueCount = 0;
- offset = 0;
- }
-
- public KmerBytesWritable getPosition(int i) {
- if (i >= valueCount) {
- throw new ArrayIndexOutOfBoundsException("No such positions");
- }
- posIter.setNewReference(kmerlength, storage, offset + i * kmerByteSize);
- return posIter;
- }
-
- public void set(KmerListWritable otherList) {
- this.kmerlength = otherList.kmerlength;
- this.kmerByteSize = otherList.kmerByteSize;
- set(otherList.valueCount, otherList.storage, otherList.offset);
- }
+/**
+ * A list of fixed-length kmers. The length of this list is stored internally
+ *
+ */
+public class KmerListWritable implements Writable, Iterable<KmerBytesWritable>,
+ Serializable {
+ private static final long serialVersionUID = 1L;
+ protected static final byte[] EMPTY_BYTES = { 0, 0, 0, 0 };
+ protected static final int HEADER_SIZE = 4;
- public void set(int valueCount, byte[] newData, int offset) {
- this.valueCount = valueCount;
- setSize(valueCount * kmerByteSize);
- if (valueCount > 0) {
- System.arraycopy(newData, offset, storage, this.offset, valueCount * kmerByteSize);
- }
- }
-
- @Override
- public Iterator<KmerBytesWritable> iterator() {
- Iterator<KmerBytesWritable> it = new Iterator<KmerBytesWritable>() {
+ protected byte[] storage;
+ protected int offset;
+ protected int valueCount;
- private int currentIndex = 0;
+ protected int bytesPerKmer = 0;
+ protected int lettersPerKmer = 0;
+ private KmerBytesWritable posIter = new KmerBytesWritable();
- @Override
- public boolean hasNext() {
- return currentIndex < valueCount;
- }
+ public KmerListWritable() { // empty list backed by the shared EMPTY_BYTES array
+     offset = 0;
+     valueCount = 0;
+     storage = EMPTY_BYTES;
+ }
- @Override
- public KmerBytesWritable next() {
- return getPosition(currentIndex++);
- }
+ public KmerListWritable(int kmerlength) { // empty list whose kmers each hold `kmerlength` letters
+     this();
+     bytesPerKmer = KmerUtil.getByteNumFromK(kmerlength);
+     lettersPerKmer = kmerlength;
+ }
- @Override
- public void remove() {
- if(currentIndex < valueCount)
- System.arraycopy(storage, offset + currentIndex * kmerByteSize,
- storage, offset + (currentIndex - 1) * kmerByteSize,
- (valueCount - currentIndex) * kmerByteSize);
- valueCount--;
- currentIndex--;
- }
- };
- return it;
- }
-
- /*
- * remove the first instance of @toRemove. Uses a linear scan. Throws an exception if not in this list.
- */
- public void remove(KmerBytesWritable toRemove, boolean ignoreMissing) {
- Iterator<KmerBytesWritable> posIterator = this.iterator();
- while (posIterator.hasNext()) {
- if(toRemove.equals(posIterator.next())) {
- posIterator.remove();
- return;
- }
- }
- if (!ignoreMissing) {
- throw new ArrayIndexOutOfBoundsException("the KmerBytesWritable `" + toRemove.toString() + "` was not found in this list.");
- }
- }
-
- public void remove(KmerBytesWritable toRemove) {
- remove(toRemove, false);
- }
+ public KmerListWritable(int kmerlength, byte[] data, int offset) { // list viewing caller-owned bytes
+     bytesPerKmer = KmerUtil.getByteNumFromK(kmerlength);
+     lettersPerKmer = kmerlength;
+     setNewReference(data, offset); // adopts `data` without copying it
+ }
- @Override
- public void readFields(DataInput in) throws IOException {
- this.valueCount = in.readInt();
- setSize(valueCount * kmerByteSize);//kmerByteSize
- in.readFully(storage, offset, valueCount * kmerByteSize);//kmerByteSize
- }
+ public KmerListWritable(List<KmerBytesWritable> kmers) {
+     this();
+     reset(kmers.isEmpty() ? 0 : kmers.get(0).getKmerLength()); // the first kmer fixes the letter count of the list
+     setSize(kmers.size() * bytesPerKmer); // reserve space (in bytes) for all elements
+     for (KmerBytesWritable kmer : kmers) {
+         if (kmer.getKmerLength() != lettersPerKmer)
+             throw new IllegalArgumentException("Kmer " + kmer.toString() + " is of incorrect length (l="
+                     + kmer.getKmerLength() + ") for this list (should be " + lettersPerKmer + ").");
+         append(kmer);
+     }
+ }
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeInt(valueCount);
- out.write(storage, offset, valueCount * kmerByteSize);
- }
-
- public int getCountOfPosition() {
- return valueCount;
- }
+ public void setNewReference(byte[] data, int offset) { // point this list at caller-owned bytes; nothing is copied
+ this.valueCount = Marshal.getInt(data, offset); // the element count is read from the int stored at `offset`
+ this.storage = data;
+ this.offset = offset; // NOTE(review): getPosition() indexes kmers from `offset` without skipping that int -- confirm the intended layout
+ }
- public byte[] getByteArray() {
- return storage;
- }
+ public void append(KmerBytesWritable kmer) {
+     setSize((1 + valueCount) * bytesPerKmer); // grow first; storage/offset may change here
+     // copy from kmer.offset, not 0: the kmer may be a reference into a larger buffer
+     System.arraycopy(kmer.getBytes(), kmer.offset, storage, offset + valueCount * bytesPerKmer, bytesPerKmer);
+     valueCount += 1;
+ }
- public int getStartOffset() {
- return offset;
- }
-
- public int getLength() {
- return valueCount * kmerByteSize;
- }
-
- @Override
- public String toString() {
- StringBuilder sbuilder = new StringBuilder();
- sbuilder.append('[');
- for(int i = 0; i < valueCount; i++){
- sbuilder.append(getPosition(i).toString());
- sbuilder.append(',');
- }
- if (valueCount > 0) {
- sbuilder.setCharAt(sbuilder.length() - 1, ']');
- } else {
- sbuilder.append(']');
- }
- return sbuilder.toString();
- }
-
- @Override
- public int hashCode() {
- return Marshal.hashBytes(getByteArray(), getStartOffset(), getLength());
- }
+ /*
+  * Copies every kmer of otherList after the last kmer of this list
+  */
+ public void appendList(KmerListWritable otherList) {
+     final int extra = otherList.valueCount;
+     if (extra <= 0) {
+         return; // nothing to append
+     }
+     setSize((valueCount + extra) * bytesPerKmer);
+     int tail = offset + valueCount * bytesPerKmer; // first free byte of my storage
+     System.arraycopy(otherList.storage, otherList.offset, storage, tail, extra * bytesPerKmer);
+     valueCount += extra;
+ }
+
+ /**
+  * Save the union of my list and otherList. Uses a temporary HashSet for
+  * uniquefication; elements are deep-copied because iteration reuses one kmer
+  */
+ public void unionUpdate(KmerListWritable otherList) {
+     int newSize = valueCount + otherList.valueCount;
+     HashSet<KmerBytesWritable> uniqueElements = new HashSet<KmerBytesWritable>(newSize);
+     for (KmerListWritable source : new KmerListWritable[] { this, otherList }) {
+         for (KmerBytesWritable kmer : source) {
+             KmerBytesWritable copy = new KmerBytesWritable(kmer.getKmerLength());
+             copy.setAsCopy(kmer); // getPosition() hands out one shared, mutable instance
+             uniqueElements.add(copy);
+         }
+     }
+     valueCount = 0; // rebuild this list from the de-duplicated copies
+     setSize(newSize * bytesPerKmer); // setSize takes a byte count, not an element count
+     for (KmerBytesWritable kmer : uniqueElements) {
+         append(kmer);
+     }
+ }
+
+ protected void setSize(int size) {
+     // when `size` bytes do not fit, grow to 1.5x the requested byte count
+     final boolean mustGrow = getCapacity() < size;
+     if (mustGrow) setCapacity(size * 3 / 2);
+ }
+
+ protected int getCapacity() { // bytes available in storage from offset to the end of the array
+ return storage.length - offset;
+ }
+
+ protected void setCapacity(int new_cap) {
+     if (new_cap <= getCapacity()) return; // never shrinks
+     // move everything from offset onward into a fresh array that starts at 0
+     byte[] grown = new byte[new_cap];
+     int carried = storage.length - offset;
+     if (carried > 0) {
+         System.arraycopy(storage, offset, grown, 0, carried);
+     }
+     storage = grown;
+     offset = 0;
+ }
+
+ public void reset(int kmerSize) { // drop all kmers and switch to kmers of kmerSize letters
+     bytesPerKmer = KmerUtil.getByteNumFromK(kmerSize);
+     lettersPerKmer = kmerSize;
+     offset = 0;
+     valueCount = 0;
+     storage = EMPTY_BYTES;
+ }
+
+ public KmerBytesWritable getPosition(int i) {
+     if (i < 0 || i >= valueCount) { // a negative index would otherwise address bytes before this list
+         throw new ArrayIndexOutOfBoundsException("No such positions");
+     }
+     // posIter is one shared view that every call re-points; copy the result to keep it
+     posIter.setAsReference(lettersPerKmer, storage, offset + i * bytesPerKmer);
+     return posIter;
+ }
+
+ public void set(KmerListWritable otherList) { // take over otherList's kmer size, then copy its bytes
+     bytesPerKmer = otherList.bytesPerKmer;
+     lettersPerKmer = otherList.lettersPerKmer;
+     set(otherList.valueCount, otherList.storage, otherList.offset);
+ }
+
+ public void set(int valueCount, byte[] newData, int offset) {
+     this.valueCount = valueCount;
+     final int byteCount = valueCount * bytesPerKmer; // payload size for the current kmer width
+     setSize(byteCount);
+     if (valueCount > 0) {
+         System.arraycopy(newData, offset, storage, this.offset, byteCount);
+     }
+ }
+
+ @Override
+ public Iterator<KmerBytesWritable> iterator() { // in-order traversal; every next() returns the same shared view
+     return new Iterator<KmerBytesWritable>() {
+         private int currentIndex = 0; // index of the kmer the next call to next() returns
+         private boolean removable = false; // true only between next() and the following remove()
+         @Override
+         public boolean hasNext() {
+             return currentIndex < valueCount;
+         }
+
+         @Override
+         public KmerBytesWritable next() {
+             KmerBytesWritable current = getPosition(currentIndex++);
+             removable = true;
+             return current;
+         }
+
+         @Override
+         public void remove() {
+             if (!removable) throw new IllegalStateException("remove() requires a preceding next()");
+             removable = false;
+             if (currentIndex < valueCount) // slide the kmers after the removed one down by one slot
+                 System.arraycopy(storage, offset + currentIndex * bytesPerKmer, storage,
+                         offset + (currentIndex - 1) * bytesPerKmer, (valueCount - currentIndex) * bytesPerKmer);
+             valueCount--;
+             currentIndex--;
+         }
+     };
+ }
+
+ /*
+  * Removes the first kmer equal to `toRemove`, found by a linear scan. When
+  * no kmer matches, throws unless ignoreMissing is true.
+  */
+ public void remove(KmerBytesWritable toRemove, boolean ignoreMissing) {
+     for (Iterator<KmerBytesWritable> cursor = iterator(); cursor.hasNext();) {
+         if (toRemove.equals(cursor.next())) {
+             cursor.remove();
+             return;
+         }
+     }
+     if (ignoreMissing) {
+         return;
+     }
+     throw new ArrayIndexOutOfBoundsException("the KmerBytesWritable `"
+             + toRemove.toString() + "` was not found in this list.");
+ }
+
+ public void remove(KmerBytesWritable toRemove) { // strict variant: a kmer that is not in the list raises an exception
+ remove(toRemove, false);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException { // reads an int count, then count * bytesPerKmer bytes
+ this.valueCount = in.readInt();
+ setSize(valueCount * bytesPerKmer);// grow storage if needed; bytesPerKmer is not read here and must already be set
+ in.readFully(storage, offset, valueCount * bytesPerKmer);// packed kmer payload
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException { // mirror of readFields: the count first, then the packed kmers
+ out.writeInt(valueCount);
+ out.write(storage, offset, valueCount * bytesPerKmer);
+ }
+
+ public int getCountOfPosition() { // number of kmers currently in the list
+ return valueCount;
+ }
+
+ public byte[] getByteArray() { // the backing array itself, not a copy
+ return storage;
+ }
+
+ public int getStartOffset() { // position in the backing array where this list's data begins
+ return offset;
+ }
+
+ public int getLength() { // bytes occupied by the kmers (count * bytes per kmer), not the number of kmers
+ return valueCount * bytesPerKmer;
+ }
+
+ /**
+  * Renders the kmers in list order, separated by commas and wrapped in
+  * square brackets; an empty list renders as "[]".
+  */
+ @Override
+ public String toString() {
+     StringBuilder text = new StringBuilder("[");
+     for (int i = 0; i < valueCount; i++) {
+         if (i > 0) {
+             text.append(',');
+         }
+         text.append(getPosition(i).toString());
+     }
+     return text.append(']').toString();
+ }
+
+ @Override
+ public int hashCode() { // delegates to Marshal.hashBytes over getLength() bytes from getStartOffset(); NOTE(review): no equals() override is visible next to it
+ return Marshal.hashBytes(getByteArray(), getStartOffset(), getLength());
+ }
}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
index efa87f7..98e37dc 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/NodeWritable.java
@@ -63,7 +63,7 @@
this.forwardReverseList.set(FRList);
this.reverseForwardList.set(RFList);
this.reverseReverseList.set(RRList);
- this.kmer.set(kmer);
+ this.kmer.setAsCopy(kmer);
}
public void reset(int kmerSize) {
@@ -90,7 +90,7 @@
}
public void setKmer(KmerBytesWritable kmer) {
- this.kmer.set(kmer);
+ this.kmer.setAsCopy(kmer);
}
public int getKmerlength() {
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionListWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionListWritable.java
index f135292..b056c14 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionListWritable.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/PositionListWritable.java
@@ -4,6 +4,7 @@
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@@ -76,6 +77,27 @@
}
}
+ /**
+ * Replace this list's contents with the union of this list and otherList.
+ * Duplicates are dropped through a temporary HashSet of PositionWritable
+ */
+ public void unionUpdate(PositionListWritable otherList) {
+ int newSize = valueCount + otherList.valueCount;
+ HashSet<PositionWritable> uniqueElements = new HashSet<PositionWritable>(
+ newSize);
+ for (PositionWritable pos : this) {
+ uniqueElements.add(pos); // NOTE(review): if iteration reuses one PositionWritable instance, every entry aliases it -- confirm, and copy if so
+ }
+ for (PositionWritable pos : otherList) {
+ uniqueElements.add(pos);
+ }
+ valueCount = 0; // restart the list; the set's elements are re-appended below
+ setSize(newSize); // NOTE(review): newSize is an element count -- confirm setSize does not expect a byte count
+ for (PositionWritable pos : uniqueElements) {
+ append(pos);
+ }
+ }
+
public static int getCountByDataLength(int length) {
if (length % PositionWritable.LENGTH != 0) {
throw new IllegalArgumentException("Length of positionlist is invalid");
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
new file mode 100644
index 0000000..7fed5c7
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.type;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparator;
+
+import edu.uci.ics.genomix.data.KmerUtil;
+import edu.uci.ics.genomix.data.Marshal;
+
+/**
+ * Variable-length kmer which stores its length internally.
+ *
+ * Note: {@code offset} as used in this class is the offset at which the
+ * <em>kmer</em> begins; a {@value #HEADER_SIZE}-byte length header precedes it.
+ */
+public class VKmerBytesWritable extends KmerBytesWritable {
+
+    private static final long serialVersionUID = 1L;
+    // Header-only storage (length 0). It is shared by empty instances and
+    // setKmerLength writes the header into it, so never store k != 0 here.
+    protected static final byte[] EMPTY_BYTES = { 0, 0, 0, 0 };
+    // Number of bytes in the length header that precedes the kmer data.
+    protected static final int HEADER_SIZE = 4;
+
+    /**
+     * Initialize as empty kmer
+     */
+    public VKmerBytesWritable() {
+        this(EMPTY_BYTES, 0); // the header block starts at 0; the kmer itself at HEADER_SIZE
+    }
+
+    /**
+     * Create a kmer holding a packed copy of the letters in the given string
+     */
+    public VKmerBytesWritable(String kmer) {
+        bytes = new byte[HEADER_SIZE + KmerUtil.getByteNumFromK(kmer.length())];
+        offset = HEADER_SIZE;
+        setAsCopy(kmer);
+    }
+
+    /**
+     * Set as reference to given data (no copy is made).
+     *
+     * @param storage byte array holding the length header, then the kmer
+     * @param offset
+     *            position of the header (not of the kmer) within storage
+     */
+    public VKmerBytesWritable(byte[] storage, int offset) {
+        setAsReference(storage, offset);
+    }
+
+    /**
+     * Reserve space for k letters
+     */
+    public VKmerBytesWritable(int k) {
+        if (k > 0) {
+            bytes = new byte[HEADER_SIZE + KmerUtil.getByteNumFromK(k)];
+        } else {
+            bytes = EMPTY_BYTES;
+        }
+        offset = HEADER_SIZE;
+        setKmerLength(k);
+    }
+
+    /**
+     * deep copy of kmer in other
+     *
+     * @param other kmer to duplicate; it is left untouched
+     */
+    public VKmerBytesWritable(VKmerBytesWritable other) {
+        this(other.lettersInKmer);
+        setAsCopy(other);
+    }
+
+    /**
+     * Deep copy of the given kmer
+     *
+     * @param other kmer whose length and packed letters are copied
+     */
+    @Override
+    public void setAsCopy(KmerBytesWritable other) {
+        reset(other.lettersInKmer);
+        if (lettersInKmer > 0) {
+            System.arraycopy(other.bytes, other.offset, bytes, this.offset,
+                    bytesUsed);
+        }
+    }
+
+    /**
+     * Set from String kmer: the letters are packed, not copied as raw chars
+     */
+    @Override
+    public void setAsCopy(String kmer) {
+        int k = kmer.length();
+        reset(k);
+        setByRead(kmer.getBytes(), 0); // pack the letters into the kmer byte format
+    }
+
+    /**
+     * Deep copy of the given bytes data
+     *
+     * @param newData
+     *            byte array to copy from; a length header precedes the kmer
+     * @param offset
+     *            position of that header within newData
+     */
+    public void setAsCopy(byte[] newData, int offset) {
+        int k = Marshal.getInt(newData, offset);
+        reset(k);
+        System.arraycopy(newData, offset + HEADER_SIZE, bytes, this.offset, bytesUsed);
+    }
+
+    /**
+     * Point this kmer at the given byte array without copying it.
+     *
+     * @param newData
+     *            byte array holding the length header followed by the kmer
+     * @param blockOffset
+     *            position of the header within newData
+     */
+    public void setAsReference(byte[] newData, int blockOffset) {
+        int kRequested = Marshal.getInt(newData, blockOffset);
+        int bytesRequested = KmerUtil.getByteNumFromK(kRequested) + HEADER_SIZE;
+        if (newData.length - blockOffset < bytesRequested) {
+            throw new IllegalArgumentException("Requested " + bytesRequested
+                    + " bytes (k=" + kRequested + ") but buffer has only "
+                    + (newData.length - blockOffset) + " bytes");
+        }
+        this.bytes = newData; // adopt the buffer only once it has been validated
+        this.offset = blockOffset + HEADER_SIZE;
+        setKmerLength(kRequested);
+    }
+
+    @Override
+    public void setKmerLength(int k) {
+        this.bytesUsed = KmerUtil.getByteNumFromK(k);
+        this.lettersInKmer = k;
+        Marshal.putInt(k, bytes, offset - HEADER_SIZE); // keep the stored header in sync with the fields
+    }
+
+    @Override
+    protected int getCapacity() {
+        return bytes.length - offset; // kmer bytes available from offset on; the header lies before offset
+    }
+
+    @Override
+    protected void setCapacity(int new_cap) {
+        if (new_cap != getCapacity()) {
+            byte[] new_data = new byte[new_cap + HEADER_SIZE]; // header slot plus new_cap kmer bytes
+            if (new_cap < bytesUsed) {
+                bytesUsed = new_cap;
+            }
+            if (bytesUsed != 0) {
+                System.arraycopy(bytes, offset, new_data, HEADER_SIZE,
+                        bytesUsed);
+            }
+            bytes = new_data;
+            offset = HEADER_SIZE;
+        }
+    }
+
+    @Override
+    public void mergeWithFFKmer(int initialKmerSize, KmerBytesWritable kmer) {
+        super.mergeWithFFKmer(initialKmerSize, kmer);
+        Marshal.putInt(lettersInKmer, bytes, offset - HEADER_SIZE); // rewrite the header after the merge
+    }
+
+    @Override
+    public void mergeWithFRKmer(int initialKmerSize, KmerBytesWritable kmer) {
+        super.mergeWithFRKmer(initialKmerSize, kmer);
+        Marshal.putInt(lettersInKmer, bytes, offset - HEADER_SIZE); // rewrite the header after the merge
+    }
+
+    @Override
+    public void mergeWithRFKmer(int initialKmerSize, KmerBytesWritable preKmer) {
+        super.mergeWithRFKmer(initialKmerSize, preKmer);
+        Marshal.putInt(lettersInKmer, bytes, offset - HEADER_SIZE); // rewrite the header after the merge
+    }
+
+    @Override
+    public void mergeWithRRKmer(int initialKmerSize, KmerBytesWritable preKmer) {
+        super.mergeWithRRKmer(initialKmerSize, preKmer);
+        Marshal.putInt(lettersInKmer, bytes, offset - HEADER_SIZE); // rewrite the header after the merge
+    }
+
+    @Override
+    public void readFields(DataInput in) throws IOException {
+        lettersInKmer = in.readInt();
+        bytesUsed = KmerUtil.getByteNumFromK(lettersInKmer);
+        if (lettersInKmer > 0) {
+            if (getCapacity() < this.bytesUsed) { // not enough room: switch to a private array
+                this.bytes = new byte[this.bytesUsed + HEADER_SIZE];
+                this.offset = HEADER_SIZE;
+            }
+            in.readFully(bytes, offset, bytesUsed);
+        }
+        Marshal.putInt(lettersInKmer, bytes, offset - HEADER_SIZE); // mirror the length into the header
+    }
+
+    /**
+     * write the entire byte array including the header
+     */
+    @Override
+    public void write(DataOutput out) throws IOException {
+        out.write(bytes, offset - HEADER_SIZE, bytesUsed + HEADER_SIZE);
+    }
+
+    @Override
+    public boolean equals(Object right) {
+        if (right instanceof VKmerBytesWritable) {
+            return super.equals(right); // compare bytes directly
+        } else if (right instanceof KmerBytesWritable) {
+            // plain Kmers carry no header; each side's offset already points at its kmer
+            KmerBytesWritable rightKmer = (KmerBytesWritable) right;
+            if (lettersInKmer != rightKmer.lettersInKmer) { // check length
+                return false;
+            }
+            for (int i = 0; i < bytesUsed; i++) { // check the packed letters byte by byte
+                if (bytes[offset + i] != rightKmer.bytes[rightKmer.offset + i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+        return false;
+    }
+
+    @Override
+    public String toString() {
+        return KmerUtil.recoverKmerFrom(this.lettersInKmer, this.getBytes(),
+                offset, this.getLength());
+    }
+
+    public static class Comparator extends WritableComparator {
+
+        public Comparator() {
+            super(VKmerBytesWritable.class);
+        }
+
+        @Override
+        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+            int kmerlength1 = Marshal.getInt(b1, s1);
+            int kmerlength2 = Marshal.getInt(b2, s2);
+            if (kmerlength1 == kmerlength2) { // same length: order by the bytes after the header
+                return compareBytes(b1, s1 + HEADER_SIZE, l1 - HEADER_SIZE, b2, s2 + HEADER_SIZE, l2 - HEADER_SIZE);
+            }
+            return kmerlength1 - kmerlength2; // otherwise shorter kmers sort first
+        }
+    }
+
+    static { // register this comparator
+        WritableComparator.define(VKmerBytesWritable.class, new Comparator());
+    }
+
+}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
index fbfbeeb..807ac13 100644
--- a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/KmerBytesWritableTest.java
@@ -131,7 +131,7 @@
Assert.assertEquals(text1 + text2.substring(kmerSize - 1), merge.toString());
for (int i = 1; i < 8; i++) {
- merge.set(kmer1);
+ merge.setAsCopy(kmer1);
merge.mergeWithFFKmer(i, kmer2);
Assert.assertEquals(text1 + text2.substring(i - 1), merge.toString());
}
@@ -147,7 +147,7 @@
Assert.assertEquals(text1, kmer1.toString());
Assert.assertEquals(text2, kmer2.toString());
for (int x = 1; x < jk; x++) {
- merge.set(kmer1);
+ merge.setAsCopy(kmer1);
merge.mergeWithFFKmer(x, kmer2);
Assert.assertEquals(text1 + text2.substring(x - 1), merge.toString());
}
@@ -177,17 +177,17 @@
Assert.assertEquals(result, merge.toString());
int i = 1;
- merge.set(kmer1);
+ merge.setAsCopy(kmer1);
merge.mergeWithFRKmer(i, kmer2);
Assert.assertEquals("AAGCTAAAACAACC", merge.toString());
i = 2;
- merge.set(kmer1);
+ merge.setAsCopy(kmer1);
merge.mergeWithFRKmer(i, kmer2);
Assert.assertEquals("AAGCTAAACAACC", merge.toString());
i = 3;
- merge.set(kmer1);
+ merge.setAsCopy(kmer1);
merge.mergeWithFRKmer(i, kmer2);
Assert.assertEquals("AAGCTAACAACC", merge.toString());
}
@@ -215,48 +215,38 @@
Assert.assertEquals(result, merge.toString());
int i = 1;
- merge.set(kmer1);
+ merge.setAsCopy(kmer1);
merge.mergeWithRFKmer(i, kmer2);
Assert.assertEquals("GGCACAAAACAACCC", merge.toString());
i = 2;
- merge.set(kmer1);
+ merge.setAsCopy(kmer1);
merge.mergeWithRFKmer(i, kmer2);
Assert.assertEquals("GGCACAAACAACCC", merge.toString());
i = 3;
- merge.set(kmer1);
+ merge.setAsCopy(kmer1);
merge.mergeWithRFKmer(i, kmer2);
Assert.assertEquals("GGCACAACAACCC", merge.toString());
- String test1;
- String test2;
- test1 = "CTA";
- test2 = "AGA";
- KmerBytesWritable k1 = new KmerBytesWritable(3);
- KmerBytesWritable k2 = new KmerBytesWritable(3);
- k1.setByRead(test1.getBytes(), 0);
- k2.setByRead(test2.getBytes(), 0);
- k1.mergeWithRFKmer(3, k2);
- Assert.assertEquals("TCTA", k1.toString());
+// String test1 = "CTTAT";
+// String test2 = "AGACC"; // rc = GGTCT
+// KmerBytesWritable k1 = new KmerBytesWritable(5);
+// KmerBytesWritable k2 = new KmerBytesWritable(5);
+// k1.setByRead(test1.getBytes(), 0);
+// k2.setByRead(test2.getBytes(), 0);
+// k1.mergeWithRFKmer(3, k2);
+// Assert.assertEquals("GGTCTTAT", k1.toString()); //GGTCGTCT -> AGACGACC ??
- test1 = "CTA";
- test2 = "ATA"; //TAT
- k1 = new KmerBytesWritable(3);
- k2 = new KmerBytesWritable(3);
- k1.setByRead(test1.getBytes(), 0);
- k2.setByRead(test2.getBytes(), 0);
- k1.mergeWithFRKmer(3, k2);
- Assert.assertEquals("CTAT", k1.toString());
-
- test1 = "ATA";
- test2 = "CTA"; //TAT
- k1 = new KmerBytesWritable(3);
- k2 = new KmerBytesWritable(3);
- k1.setByRead(test1.getBytes(), 0);
- k2.setByRead(test2.getBytes(), 0);
- k1.mergeWithFRKmer(3, k2);
- Assert.assertEquals("ATAG", k1.toString());
+ String test3 = "CTA";
+ String test4 = "AGA"; // rc = TCT
+ KmerBytesWritable k3 = new KmerBytesWritable(3);
+ KmerBytesWritable k4 = new KmerBytesWritable(3);
+ k3.setByRead(test3.getBytes(), 0);
+ k4.setByRead(test4.getBytes(), 0);
+ k3.mergeWithRFKmer(3, k4);
+ Assert.assertEquals("TCTA", k3.toString());
+// Assert.assertEquals("CTAT", k3); // this is an incorrect test case-- the merge always flips the passed-in kmer
}
@@ -278,7 +268,7 @@
Assert.assertEquals(text1 + text2.substring(kmerSize - 1), merge.toString());
for (int i = 1; i < 8; i++) {
- merge.set(kmer2);
+ merge.setAsCopy(kmer2);
merge.mergeWithRRKmer(i, kmer1);
Assert.assertEquals(text1.substring(0, text1.length() - i + 1) + text2, merge.toString());
}
@@ -294,62 +284,30 @@
Assert.assertEquals(text1, kmer1.toString());
Assert.assertEquals(text2, kmer2.toString());
for (int x = 1; x < ik; x++) {
- merge.set(kmer2);
+ merge.setAsCopy(kmer2);
merge.mergeWithRRKmer(x, kmer1);
Assert.assertEquals(text1.substring(0, text1.length() - x + 1) + text2, merge.toString());
}
}
}
}
-
+
@Test
- public void TestFinalMerge() {
- String selfString;
- String match;
- String msgString;
- int index;
- KmerBytesWritable kmer = new KmerBytesWritable();
- int kmerSize = 3;
-
- String F1 = "AATAG";
- String F2 = "TAGAA";
- String R1 = "CTATT";
- String R2 = "TTCTA";
-
- //FF test
- selfString = F1;
- match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
- msgString = F2;
- index = msgString.indexOf(match);
- kmer.reset(msgString.length() - index);
- kmer.setByRead(msgString.substring(index).getBytes(), 0);
- System.out.println(kmer.toString());
-
- //FR test
- selfString = F1;
- match = selfString.substring(selfString.length() - kmerSize + 1,selfString.length());
- msgString = GeneCode.reverseComplement(R2);
- index = msgString.indexOf(match);
- kmer.reset(msgString.length() - index);
- kmer.setByRead(msgString.substring(index).getBytes(), 0);
- System.out.println(kmer.toString());
-
- //RF test
- selfString = R1;
- match = selfString.substring(0,kmerSize - 1);
- msgString = GeneCode.reverseComplement(F2);
- index = msgString.lastIndexOf(match) + kmerSize - 2;
- kmer.reset(index + 1);
- kmer.setByReadReverse(msgString.substring(0, index + 1).getBytes(), 0);
- System.out.println(kmer.toString());
-
- //RR test
- selfString = R1;
- match = selfString.substring(0,kmerSize - 1);
- msgString = R2;
- index = msgString.lastIndexOf(match) + kmerSize - 2;
- kmer.reset(index + 1);
- kmer.setByRead(msgString.substring(0, index + 1).getBytes(), 0);
- System.out.println(kmer.toString());
+ public void TestMergeRFAndRRKmer() {
+ String test1 = "TAGAT";
+ String test2 = "TCTAG"; // rc = CTAGA
+ String test3 = "GCTAG";
+ KmerBytesWritable k1 = new KmerBytesWritable(5);
+ KmerBytesWritable k2 = new KmerBytesWritable(5);
+ KmerBytesWritable k3 = new KmerBytesWritable(5);
+ k1.setByRead(test1.getBytes(), 0);
+ k2.setByRead(test2.getBytes(), 0);
+ k3.setByRead(test3.getBytes(), 0);
+ k1.mergeWithRFKmer(5, k2);
+ Assert.assertEquals("CTAGAT", k1.toString());
+ k1.mergeWithRRKmer(5, k3);
+ Assert.assertEquals("GCTAGAT", k1.toString());
}
}
+
+
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/VKmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/VKmerBytesWritableTest.java
new file mode 100644
index 0000000..a50e465
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/data/test/VKmerBytesWritableTest.java
@@ -0,0 +1,409 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.data.test;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class VKmerBytesWritableTest {
+ static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' }; // shared 8-letter read used by the compress/move tests
+ static int k = 7; // kmer length used with the shared read (one letter shorter than the read)
+
+    @Test
+    public void TestCompressKmer() {
+        VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+        kmer.setByRead(array, 0); // first k letters of the shared read
+        Assert.assertEquals("AATAGAA", kmer.toString()); // expected value goes first in assertEquals
+        // Reading again from offset 1 must give the next k-letter window.
+        kmer.setByRead(array, 1);
+        Assert.assertEquals("ATAGAAG", kmer.toString());
+    }
+
+    @Test
+    public void TestMoveKmer() {
+        VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AATAGAA", kmer.toString());
+        // NOTE(review): with k = 7 and an 8-letter read this loop never runs; the failing assert looks like a guard -- confirm.
+        for (int i = k; i < array.length - 1; i++) {
+            kmer.shiftKmerWithNextCode(array[i]);
+            Assert.assertTrue(false);
+        }
+        // Appending the last letter of the read must return the code of the letter pushed out at the front.
+        byte out = kmer.shiftKmerWithNextChar(array[array.length - 1]);
+        Assert.assertEquals(GeneCode.getCodeFromSymbol((byte) 'A'), out);
+        Assert.assertEquals("ATAGAAG", kmer.toString());
+    }
+
+    @Test
+    public void TestCompressKmerReverse() {
+        VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AATAGAA", kmer.toString()); // expected value goes first in assertEquals
+        // The reverse read of the window at offset 1 (ATAGAAG) must be its reverse complement.
+        kmer.setByReadReverse(array, 1);
+        Assert.assertEquals("CTTCTAT", kmer.toString());
+    }
+
+    @Test
+    public void TestMoveKmerReverse() {
+        VKmerBytesWritable kmer = new VKmerBytesWritable(k);
+        kmer.setByRead(array, 0);
+        Assert.assertEquals("AATAGAA", kmer.toString());
+        // NOTE(review): with k = 7 and an 8-letter read this loop never runs; the failing assert looks like a guard -- confirm.
+        for (int i = k; i < array.length - 1; i++) {
+            kmer.shiftKmerWithPreChar(array[i]);
+            Assert.assertTrue(false);
+        }
+        // Prepending the last letter of the read must return the code of the letter pushed out at the end.
+        byte out = kmer.shiftKmerWithPreChar(array[array.length - 1]);
+        Assert.assertEquals(GeneCode.getCodeFromSymbol((byte) 'A'), out);
+        Assert.assertEquals("GAATAGA", kmer.toString());
+    }
+
+    @Test
+    public void TestGetGene() {
+        String expected = "AGCTGACCG";
+        byte[] read = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G' };
+        VKmerBytesWritable kmer = new VKmerBytesWritable(9);
+        kmer.setByRead(read, 0);
+        // Every position must decode back to the letter it was built from.
+        for (int pos = 0; pos < 9; pos++) {
+            char symbol = (char) (GeneCode.getSymbolFromCode(kmer.getGeneCodeAtPosition(pos)));
+            Assert.assertEquals(expected.charAt(pos), symbol);
+        }
+    }
+
+    @Test
+    public void TestGetOneByteFromKmer() {
+        byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+        String string = "AGCTGACCGT";
+        for (int len = 3; len <= 10; len++) { // named len so it does not shadow the static k
+            VKmerBytesWritable kmer = new VKmerBytesWritable(len);
+            VKmerBytesWritable kmerAppend = new VKmerBytesWritable(len);
+            kmer.setByRead(array, 0);
+            Assert.assertEquals(string.substring(0, len), kmer.toString());
+            for (int b = 0; b < len; b++) {
+                byte byteActual = VKmerBytesWritable.getOneByteFromKmerAtPosition(b, kmer.getBytes(),
+                        kmer.getOffset(), kmer.getLength());
+                // Expected byte: up to four 2-bit codes starting at letter b, letter b in the lowest bits.
+                byte byteExpect = GeneCode.getCodeFromSymbol(array[b]);
+                for (int i = 1; i < 4 && b + i < len; i++) {
+                    byteExpect += GeneCode.getCodeFromSymbol(array[b + i]) << (i * 2);
+                }
+                Assert.assertEquals(byteExpect, byteActual);
+                VKmerBytesWritable.appendOneByteAtPosition(b, byteActual, kmerAppend.getBytes(),
+                        kmerAppend.getOffset(), kmerAppend.getLength());
+            }
+            // Writing the extracted bytes into the second kmer must reproduce the first one.
+            Assert.assertEquals(kmer.toString(), kmerAppend.toString());
+        }
+    }
+
+    @Test
+    public void TestMergeFFKmer() {
+        byte[] read = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+        String readText = "AGCTGACCGT";
+        // Two overlapping 8-letter windows of the read, one letter apart.
+        String leftText = "AGCTGACC";
+        VKmerBytesWritable left = new VKmerBytesWritable(8);
+        left.setByRead(read, 0);
+        Assert.assertEquals(leftText, left.toString());
+        String rightText = "GCTGACCG";
+        VKmerBytesWritable right = new VKmerBytesWritable(8);
+        right.setByRead(read, 1);
+        Assert.assertEquals(rightText, right.toString());
+
+        // Merging with the full kmer size appends only the letters past the (size - 1) overlap.
+        final int kmerSize = 8;
+        VKmerBytesWritable merged = new VKmerBytesWritable(left);
+        merged.mergeWithFFKmer(kmerSize, right);
+        Assert.assertEquals(leftText + rightText.substring(kmerSize - 1), merged.toString());
+
+        // Smaller sizes keep more of the appended kmer.
+        for (int size = 1; size < 8; size++) {
+            merged.setAsCopy(left);
+            merged.mergeWithFFKmer(size, right);
+            Assert.assertEquals(leftText + rightText.substring(size - 1), merged.toString());
+        }
+
+        // Same check for every pair of prefix lengths and every size below the second length.
+        for (int lenA = 1; lenA <= 10; lenA++) {
+            for (int lenB = 1; lenB <= 10; lenB++) {
+                VKmerBytesWritable prefixA = new VKmerBytesWritable(lenA);
+                VKmerBytesWritable prefixB = new VKmerBytesWritable(lenB);
+                prefixA.setByRead(read, 0);
+                prefixB.setByRead(read, 0);
+                String textA = readText.substring(0, lenA);
+                String textB = readText.substring(0, lenB);
+                Assert.assertEquals(textA, prefixA.toString());
+                Assert.assertEquals(textB, prefixB.toString());
+                for (int size = 1; size < lenB; size++) {
+                    merged.setAsCopy(prefixA);
+                    merged.mergeWithFFKmer(size, prefixB);
+                    Assert.assertEquals(textA + textB.substring(size - 1), merged.toString());
+                }
+            }
+        }
+    }
+
+    @Test
+    public void TestMergeFRKmer() {
+        String read = "AAGCTAACAACC";
+        byte[] readBytes = read.getBytes();
+
+        // Forward kmer: the first seven letters of the read.
+        String headText = "AAGCTAA";
+        VKmerBytesWritable head = new VKmerBytesWritable(headText.length());
+        head.setByRead(readBytes, 0);
+        Assert.assertEquals(headText, head.toString());
+
+        // Reverse kmer: the reverse complement of the last seven letters of the read.
+        String tailRcText = "GGTTGTT";
+        VKmerBytesWritable tailRc = new VKmerBytesWritable(tailRcText.length());
+        tailRc.setByReadReverse(readBytes, read.length() - tailRcText.length());
+        Assert.assertEquals(tailRcText, tailRc.toString());
+
+        // With an initial kmer size of 3 the FR merge must rebuild the whole read.
+        final int kmerSize = 3;
+        VKmerBytesWritable merged = new VKmerBytesWritable(head);
+        merged.mergeWithFRKmer(kmerSize, tailRc);
+        Assert.assertEquals(read, merged.toString());
+
+        // Expected merge results for sizes 1, 2 and 3, in that order; a smaller
+        // size leaves a shorter overlap and therefore a longer result.
+        String[] expectedBySize = {
+            "AAGCTAAAACAACC",
+            "AAGCTAAACAACC",
+            "AAGCTAACAACC"
+        };
+        for (int size = 1; size <= expectedBySize.length; size++) {
+            merged.setAsCopy(head);
+            merged.mergeWithFRKmer(size, tailRc);
+            Assert.assertEquals(expectedBySize[size - 1], merged.toString());
+        }
+    }
+
+    @Test
+    public void TestMergeRFKmer() {
+        String read = "GGCACAACAACCC";
+        byte[] readBytes = read.getBytes();
+
+        // Forward kmer: the last eight letters of the read.
+        String tailText = "AACAACCC";
+        VKmerBytesWritable tail = new VKmerBytesWritable(tailText.length());
+        tail.setByRead(readBytes, 5);
+        Assert.assertEquals(tailText, tail.toString());
+
+        // Reverse kmer: the reverse complement of the first seven letters of the read.
+        String headRcText = "TTGTGCC";
+        VKmerBytesWritable headRc = new VKmerBytesWritable(headRcText.length());
+        headRc.setByReadReverse(readBytes, 0);
+        Assert.assertEquals(headRcText, headRc.toString());
+
+        // With an initial kmer size of 3 the RF merge must rebuild the whole read.
+        final int kmerSize = 3;
+        VKmerBytesWritable merged = new VKmerBytesWritable(tail);
+        merged.mergeWithRFKmer(kmerSize, headRc);
+        Assert.assertEquals(read, merged.toString());
+
+        // Expected merge results for sizes 1, 2 and 3, in that order.
+        String[] expectedBySize = {
+            "GGCACAAAACAACCC",
+            "GGCACAAACAACCC",
+            "GGCACAACAACCC"
+        };
+        for (int size = 1; size <= expectedBySize.length; size++) {
+            merged.setAsCopy(tail);
+            merged.mergeWithRFKmer(size, headRc);
+            Assert.assertEquals(expectedBySize[size - 1], merged.toString());
+        }
+
+        // Disabled case kept for reference; its expected value was never settled:
+        // String test1 = "CTTAT";
+        // String test2 = "AGACC"; // rc = GGTCT
+        // VKmerBytesWritable k1 = new VKmerBytesWritable(5);
+        // VKmerBytesWritable k2 = new VKmerBytesWritable(5);
+        // k1.setByRead(test1.getBytes(), 0);
+        // k2.setByRead(test2.getBytes(), 0);
+        // k1.mergeWithRFKmer(3, k2);
+        // Assert.assertEquals("GGTCTTAT", k1.toString()); //GGTCGTCT ->
+        // AGACGACC ??
+
+        // Minimal case: the merge always flips the passed-in kmer, so AGA
+        // (rc = TCT) merged onto CTA is expected to give TCTA, not CTAT.
+        String baseText = "CTA";
+        String flippedText = "AGA"; // rc = TCT
+        VKmerBytesWritable base = new VKmerBytesWritable(3);
+        VKmerBytesWritable flipped = new VKmerBytesWritable(3);
+        base.setByRead(baseText.getBytes(), 0);
+        flipped.setByRead(flippedText.getBytes(), 0);
+        base.mergeWithRFKmer(3, flipped);
+        Assert.assertEquals("TCTA", base.toString());
+    }
+
+    @Test
+    public void TestMergeRRKmer() {
+        byte[] read = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+        String readText = "AGCTGACCGT";
+        // Two overlapping 8-letter windows of the read; only the later one is verified up front.
+        String earlierText = "AGCTGACC";
+        VKmerBytesWritable earlier = new VKmerBytesWritable(8);
+        earlier.setByRead(read, 0);
+        String laterText = "GCTGACCG";
+        VKmerBytesWritable later = new VKmerBytesWritable(8);
+        later.setByRead(read, 1);
+        Assert.assertEquals(laterText, later.toString());
+
+        // The expected values below have the argument's letters in front of the receiver's.
+        final int kmerSize = 8;
+        VKmerBytesWritable merged = new VKmerBytesWritable(later);
+        merged.mergeWithRRKmer(kmerSize, earlier);
+        Assert.assertEquals(earlierText + laterText.substring(kmerSize - 1), merged.toString());
+        for (int size = 1; size < 8; size++) {
+            merged.setAsCopy(later);
+            merged.mergeWithRRKmer(size, earlier);
+            String keptPrefix = earlierText.substring(0, earlierText.length() - size + 1);
+            Assert.assertEquals(keptPrefix + laterText, merged.toString());
+        }
+
+        // Same check for every pair of prefix lengths and every size below the first length.
+        for (int lenA = 1; lenA <= 10; lenA++) {
+            for (int lenB = 1; lenB <= 10; lenB++) {
+                VKmerBytesWritable prefixA = new VKmerBytesWritable(lenA);
+                VKmerBytesWritable prefixB = new VKmerBytesWritable(lenB);
+                prefixA.setByRead(read, 0);
+                prefixB.setByRead(read, 0);
+                String textA = readText.substring(0, lenA);
+                String textB = readText.substring(0, lenB);
+                Assert.assertEquals(textA, prefixA.toString());
+                Assert.assertEquals(textB, prefixB.toString());
+                for (int size = 1; size < lenA; size++) {
+                    merged.setAsCopy(prefixB);
+                    merged.mergeWithRRKmer(size, prefixA);
+                    Assert.assertEquals(textA.substring(0, textA.length() - size + 1) + textB, merged.toString());
+                }
+            }
+        }
+    }
+
+    @Test
+    public void TestMergeRFAndRRKmer() {
+        // Extend the same kmer at its front twice: first an RF merge, then an RR merge.
+        VKmerBytesWritable target = new VKmerBytesWritable(5);
+        target.setByRead("TAGAT".getBytes(), 0);
+        VKmerBytesWritable rfNeighbor = new VKmerBytesWritable(5);
+        rfNeighbor.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+        VKmerBytesWritable rrNeighbor = new VKmerBytesWritable(5);
+        rrNeighbor.setByRead("GCTAG".getBytes(), 0);
+        // RF: the expected value has the argument's reverse complement in front of the target.
+        target.mergeWithRFKmer(5, rfNeighbor);
+        Assert.assertEquals("CTAGAT", target.toString());
+        // RR: the expected value has the argument itself in front of the merged kmer.
+        target.mergeWithRRKmer(5, rrNeighbor);
+        Assert.assertEquals("GCTAGAT", target.toString());
+    }
+
+    @Test
+    public void TestMergeRFAndRFKmer() {
+        // Two RF merges in a row, each expected to extend the kmer by one letter at the front.
+        VKmerBytesWritable target = new VKmerBytesWritable(5);
+        target.setByRead("TAGAT".getBytes(), 0);
+        VKmerBytesWritable firstNeighbor = new VKmerBytesWritable(5);
+        firstNeighbor.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+        VKmerBytesWritable secondNeighbor = new VKmerBytesWritable(5);
+        secondNeighbor.setByRead("CTAGC".getBytes(), 0); // rc = GCTAG
+        // First merge: rc(TCTAG) = CTAGA overlaps the start of TAGAT.
+        target.mergeWithRFKmer(5, firstNeighbor);
+        Assert.assertEquals("CTAGAT", target.toString());
+        // Second merge: rc(CTAGC) = GCTAG overlaps the start of CTAGAT.
+        target.mergeWithRFKmer(5, secondNeighbor);
+        Assert.assertEquals("GCTAGAT", target.toString());
+    }
+
+    @Test
+    public void TestMergeRFAndFRKmer() {
+        // Grow one kmer in both directions: an RF merge at the front, then an FR merge at the back.
+        VKmerBytesWritable middle = new VKmerBytesWritable(5);
+        middle.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+        VKmerBytesWritable frontNeighbor = new VKmerBytesWritable(5);
+        frontNeighbor.setByRead("TAGAT".getBytes(), 0); // rc = ATCTA
+        VKmerBytesWritable backNeighbor = new VKmerBytesWritable(5);
+        backNeighbor.setByRead("GCTAG".getBytes(), 0); // rc = CTAGC
+        // RF: rc(TAGAT) = ATCTA overlaps the start of TCTAG.
+        middle.mergeWithRFKmer(5, frontNeighbor);
+        Assert.assertEquals("ATCTAG", middle.toString());
+        // FR: rc(GCTAG) = CTAGC overlaps the end of ATCTAG.
+        middle.mergeWithFRKmer(5, backNeighbor);
+        Assert.assertEquals("ATCTAGC", middle.toString());
+    }
+
+    @Test
+    public void TestMergeRFAndFFKmer() {
+        // Grow one kmer in both directions: an RF merge at the front, then an FF merge at the back.
+        VKmerBytesWritable middle = new VKmerBytesWritable(5);
+        middle.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+        VKmerBytesWritable frontNeighbor = new VKmerBytesWritable(5);
+        frontNeighbor.setByRead("TAGAT".getBytes(), 0); // rc = ATCTA
+        VKmerBytesWritable backNeighbor = new VKmerBytesWritable(5);
+        backNeighbor.setByRead("CTAGC".getBytes(), 0); // rc = GCTAG
+        // RF: rc(TAGAT) = ATCTA overlaps the start of TCTAG.
+        middle.mergeWithRFKmer(5, frontNeighbor);
+        Assert.assertEquals("ATCTAG", middle.toString());
+        // FF: CTAGC itself overlaps the end of ATCTAG.
+        middle.mergeWithFFKmer(5, backNeighbor);
+        Assert.assertEquals("ATCTAGC", middle.toString());
+    }
+
+    @Test
+    public void TestMergeKmerAndVKmer() {
+        // Mixed types: a KmerBytesWritable is merged into a VKmerBytesWritable (RF), then an FF merge follows.
+        VKmerBytesWritable middle = new VKmerBytesWritable(5);
+        middle.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+        KmerBytesWritable fixedFront = new KmerBytesWritable(5);
+        fixedFront.setByRead("TAGAT".getBytes(), 0); // rc = ATCTA
+        VKmerBytesWritable backNeighbor = new VKmerBytesWritable(5);
+        backNeighbor.setByRead("CTAGC".getBytes(), 0); // rc = GCTAG
+        // RF with the KmerBytesWritable argument: rc(TAGAT) = ATCTA overlaps the start of TCTAG.
+        middle.mergeWithRFKmer(5, fixedFront);
+        Assert.assertEquals("ATCTAG", middle.toString());
+        // FF: CTAGC itself overlaps the end of ATCTAG.
+        middle.mergeWithFFKmer(5, backNeighbor);
+        Assert.assertEquals("ATCTAGC", middle.toString());
+    }
+
+    @Test
+    public void TestMergeKmerAndVKmerRFAndRFKmer() {
+        // Mixed types: two VKmerBytesWritable neighbours are RF-merged into a KmerBytesWritable.
+        KmerBytesWritable target = new KmerBytesWritable(5);
+        target.setByRead("TAGAT".getBytes(), 0);
+        VKmerBytesWritable firstNeighbor = new VKmerBytesWritable(5);
+        firstNeighbor.setByRead("TCTAG".getBytes(), 0); // rc = CTAGA
+        VKmerBytesWritable secondNeighbor = new VKmerBytesWritable(5);
+        secondNeighbor.setByRead("CTAGC".getBytes(), 0); // rc = GCTAG
+        // First merge: rc(TCTAG) = CTAGA overlaps the start of TAGAT.
+        target.mergeWithRFKmer(5, firstNeighbor);
+        Assert.assertEquals("CTAGAT", target.toString());
+        // Second merge: rc(CTAGC) = GCTAG overlaps the start of CTAGAT.
+        target.mergeWithRFKmer(5, secondNeighbor);
+        Assert.assertEquals("GCTAGAT", target.toString());
+    }
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ReadsKeyValueParserFactory.java
index b101312..5b612dc 100644
--- a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ReadsKeyValueParserFactory.java
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/dataflow/ReadsKeyValueParserFactory.java
@@ -46,6 +46,7 @@
public static final int OutputKmerField = 0;
public static final int outputNodeIdListField = 1;
+
public static final int OutputForwardForwardField = 2;
public static final int OutputFFListCountField = 3;
public static final int OutputForwardReverseField = 4;
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/GenomixJobConf.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/GenomixJobConf.java
new file mode 100644
index 0000000..b0edf77
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/hyracks/newgraph/job/GenomixJobConf.java
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.hyracks.newgraph.job;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+
+/** {@link JobConf} subclass that declares the genomix configuration keys, their default values and a few typed setters. */
+@SuppressWarnings("deprecation")
+public class GenomixJobConf extends JobConf {
+    public static final String JOB_NAME = "genomix";
+
+    /** Key: kmer length. */
+    public static final String KMER_LENGTH = "genomix.kmerlen";
+    /** Key: read length. */
+    public static final String READ_LENGTH = "genomix.readlen";
+    /** Key: frame size. */
+    public static final String FRAME_SIZE = "genomix.framesize";
+    /** Key: frame limit (needed by hyracks). */
+    public static final String FRAME_LIMIT = "genomix.framelimit";
+    /** Key: table size (needed by hyracks). */
+    public static final String TABLE_SIZE = "genomix.tablesize";
+    /** Key: group-by type. */
+    public static final String GROUPBY_TYPE = "genomix.graph.groupby.type";
+    /** Key: graph output format. */
+    public static final String OUTPUT_FORMAT = "genomix.graph.output";
+    /** Key: get the reversed kmer sequence. */
+    public static final String REVERSED_KMER = "genomix.kmer.reversed";
+
+    /** Keys used by the hybrid group-by function in the graph build phase. */
+    public static final String GROUPBY_HYBRID_INPUTSIZE = "genomix.graph.groupby.hybrid.inputsize";
+    public static final String GROUPBY_HYBRID_INPUTKEYS = "genomix.graph.groupby.hybrid.inputkeys";
+    public static final String GROUPBY_HYBRID_RECORDSIZE_SINGLE = "genomix.graph.groupby.hybrid.recordsize.single";
+    public static final String GROUPBY_HYBRID_RECORDSIZE_CROSS = "genomix.graph.groupby.hybrid.recordsize.cross";
+    public static final String GROUPBY_HYBRID_HASHLEVEL = "genomix.graph.groupby.hybrid.hashlevel";
+
+    // Default values, listed in the same order as the keys above.
+    public static final int DEFAULT_KMERLEN = 21;
+    public static final int DEFAULT_READLEN = 124;
+    public static final int DEFAULT_FRAME_SIZE = 128 * 1024;
+    public static final int DEFAULT_FRAME_LIMIT = 4096;
+    public static final int DEFAULT_TABLE_SIZE = 10485767;
+    public static final boolean DEFAULT_REVERSED = true;
+    public static final long DEFAULT_GROUPBY_HYBRID_INPUTSIZE = 154000000L;
+    public static final long DEFAULT_GROUPBY_HYBRID_INPUTKEYS = 38500000L;
+    public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_SINGLE = 9;
+    public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS = 13;
+    public static final int DEFAULT_GROUPBY_HYBRID_HASHLEVEL = 1;
+
+    public static final String JOB_PLAN_GRAPHBUILD = "graphbuild";
+    public static final String JOB_PLAN_GRAPHSTAT = "graphstat";
+
+    public static final String GROUPBY_TYPE_HYBRID = "hybrid";
+    public static final String GROUPBY_TYPE_EXTERNAL = "external";
+    public static final String GROUPBY_TYPE_PRECLUSTER = "precluster";
+    public static final String OUTPUT_FORMAT_BINARY = "binary";
+    public static final String OUTPUT_FORMAT_TEXT = "text";
+
+    public GenomixJobConf() throws IOException {
+        super(new Configuration());
+    }
+
+    public GenomixJobConf(Configuration conf) throws IOException {
+        super(conf);
+    }
+
+    /**
+     * Stores the kmer length under {@link #KMER_LENGTH}.
+     * @param kmerlength the desired kmer length
+     */
+    public final void setKmerLength(int kmerlength) {
+        setInt(KMER_LENGTH, kmerlength);
+    }
+
+    /** Stores the frame size under {@link #FRAME_SIZE}. */
+    public final void setFrameSize(int frameSize) {
+        setInt(FRAME_SIZE, frameSize);
+    }
+
+    /** Stores the frame limit under {@link #FRAME_LIMIT}. */
+    public final void setFrameLimit(int frameLimit) {
+        setInt(FRAME_LIMIT, frameLimit);
+    }
+
+    /** Stores the table size under {@link #TABLE_SIZE}. */
+    public final void setTableSize(int tableSize) {
+        setInt(TABLE_SIZE, tableSize);
+    }
+}