add script for simple random read generation
diff --git a/.gitignore b/.gitignore
index 774c318..f7ed97c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,4 @@
teststore
.DS_Store
bin
+.idea
diff --git a/genomix/genomix-hadoop/src/test/python/convert_graphviz.py b/genomix/genomix-hadoop/src/test/python/convert_graphviz.py
index fe9b778..4130052 100755
--- a/genomix/genomix-hadoop/src/test/python/convert_graphviz.py
+++ b/genomix/genomix-hadoop/src/test/python/convert_graphviz.py
@@ -100,7 +100,7 @@
parser.add_argument('txt_graphs', nargs='*')
parser.add_argument('--directory', '-d', help='Recurse here and plot all '
- 'graphs that are found.', nargs='+', default=[])
+ 'graphs that are found.', action='append', default=[])
return parser
diff --git a/genomix/genomix-hadoop/src/test/python/generate_graph.py b/genomix/genomix-hadoop/src/test/python/generate_graph.py
new file mode 100644
index 0000000..2053bf9
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/python/generate_graph.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Generate a random genome and a random smattering of reads sampled from it.
+"""
+
+import sys
+import argparse
+import random
+import itertools
+import string
+
+
+def get_parser():
+    """Build the command-line parser for the read generator.
+
+    Returns an ``argparse.ArgumentParser``.  Coverage, genome length and
+    read length are mandatory; the two output options are opened for
+    writing by argparse and default to ``reads.txt`` and ``genome.txt``.
+    """
+    cli = argparse.ArgumentParser()
+    option = cli.add_argument
+
+    # Sampling behaviour.
+    option('--walk', '-w', action='store_true')
+    option('--coverage', '-c', required=True, type=float)
+    option('--genome-length', '-g', required=True, type=int)
+    option('--read-length', '-l', required=True, type=int)
+    option('--no-rc', action='store_true')
+    option('--error-rate', default=.01, type=float)
+
+    # Output destinations; one FileType converter serves both options.
+    writable = argparse.FileType('w')
+    option('--outreads', '-r', default='reads.txt', type=writable)
+    option('--outgenome', '-o', default='genome.txt', type=writable)
+    return cli
+
+
+# string.maketrans exists only on Python 2; Python 3 moved it to str.maketrans.
+try:
+    _COMPLEMENT_TABLE = string.maketrans('ACGT', 'TGCA')
+except AttributeError:
+    _COMPLEMENT_TABLE = str.maketrans('ACGT', 'TGCA')
+
+
+def reverse_complement(kmer, _table=_COMPLEMENT_TABLE):
+    """Return the reverse complement of the DNA string *kmer*.
+
+    Each of ``A``/``C``/``G``/``T`` is swapped for its complementary base
+    and the result is reversed; any other character is passed through
+    unchanged.  ``_table`` is the translation table built once at import
+    time and is not meant to be supplied by callers.
+
+    The ``translate`` method is used rather than ``string.translate``
+    because the latter was removed in Python 3.
+    """
+    return kmer.translate(_table)[::-1]
+
+
+def make_genome(length):
+    """Return a random DNA sequence of *length* bases.
+
+    Every base is drawn independently from ``ACGT`` with ``random.choice``,
+    so seeding the ``random`` module makes the result reproducible.  A
+    *length* of zero gives the empty string.
+
+    ``itertools.repeat`` drives the loop instead of ``xrange``: it is just
+    as lazy but, unlike ``xrange``, also exists on Python 3.
+    """
+    return ''.join(random.choice('ACGT')
+                   for _ in itertools.repeat(None, length))
+
+
+def make_reads(genome, read_length, coverage, walk=False, no_rc=False,
+               error_rate=0.):
+    """Yield simulated reads sampled from *genome*, one text line each.
+
+    Arguments:
+        genome: the source sequence (a string).
+        read_length: number of bases in every read.
+        coverage: desired average depth; the number of reads produced is
+            ``int(coverage * len(genome)) // read_length``.
+        walk: if true, start positions step evenly along the genome and
+            wrap around; otherwise each start is drawn at random.
+        no_rc: if true, never reverse-complement a read; otherwise each
+            read is reverse-complemented with probability one half.
+        error_rate: per-base probability of substituting a different base.
+
+    Yields:
+        One string per read: the read id (counting up from 1), a tab, the
+        read sequence and a newline.  Once the last read has been
+        produced, the number of substituted bases is reported on stderr.
+
+    Raises:
+        ValueError: on first iteration, if reads were requested but
+            *read_length* is longer than *genome*.
+    """
+    # Floor division keeps the count an integer on Python 2 and 3 alike.
+    num_reads = int(coverage * len(genome)) // read_length
+    last_start = len(genome) - read_length
+    if num_reads > 0 and last_start < 0:
+        raise ValueError('read length %d exceeds genome length %d'
+                         % (read_length, len(genome)))
+
+    if walk:
+        # max(1, num_reads) avoids dividing by zero when the requested
+        # coverage is too low to produce a single read.
+        step_size = max(1, len(genome) // max(1, num_reads))
+        next_starts = itertools.cycle(range(0, last_start + 1, step_size))
+    else:
+        # randrange excludes its upper bound, so "+ 1" is needed to make
+        # the final valid start position reachable (and to allow a read
+        # that spans the whole genome).
+        next_starts = (random.randrange(last_start + 1)
+                       for _ in itertools.repeat(None))
+
+    num_errors = 0
+    for i in range(1, num_reads + 1):
+        start = next(next_starts)
+        seq = genome[start:start + read_length]
+        if not no_rc and random.choice([True, False]):
+            seq = reverse_complement(seq)
+        final_seq = []
+        for base in seq:
+            if random.random() < error_rate:
+                num_errors += 1
+                # Build the candidates in a fixed order: iterating a set
+                # would make seeded runs differ under hash randomisation.
+                final_seq.append(
+                    random.choice([b for b in 'ACGT' if b != base]))
+            else:
+                final_seq.append(base)
+
+        yield '%s\t%s\n' % (i, ''.join(final_seq))
+    sys.stderr.write('introduced %s errors\n' % num_errors)
+
+
+def main(args):
+    """Generate a genome and its reads from command-line style *args*.
+
+    *args* is the argument list without the program name.  The genome is
+    written to the ``--outgenome`` file followed by a newline, and the
+    reads produced by ``make_reads`` are written to the ``--outreads``
+    file.
+    """
+    options = get_parser().parse_args(args)
+    try:
+        genome = make_genome(options.genome_length)
+        options.outgenome.write(genome)
+        options.outgenome.write('\n')
+        options.outreads.writelines(make_reads(
+            genome, options.read_length, options.coverage, options.walk,
+            options.no_rc, options.error_rate))
+    finally:
+        # argparse opened these files; close them so the data is flushed
+        # even when main() is called from a longer-lived process.  "-" maps
+        # to sys.stdout, which is not ours to close.
+        for handle in (options.outgenome, options.outreads):
+            if handle is not sys.stdout:
+                handle.close()
+
+
+# Run as a script: pass the command-line arguments (minus the program name)
+# to main().
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff --git a/genomix/genomix-hadoop/src/test/resources/data/sequence/walk_random_seq1.txt b/genomix/genomix-hadoop/src/test/resources/data/sequence/walk_random_seq1.txt
new file mode 100644
index 0000000..35f1c49
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/resources/data/sequence/walk_random_seq1.txt
@@ -0,0 +1,37 @@
+1 TAGTGCGA
+2 CCTCGCAC
+3 GCTAGGGT
+4 GAGGGTTG
+5 AGCAACCC
+6 GTTGCTGA
+7 TTTCAGCA
+8 CTGAAATC
+9 CAGATTTC
+10 GGCAGATT
+11 CTGGCAGA
+12 CTCTGGCA
+13 ATCTCTGG
+14 GCATCTCT
+15 CGGCATCT
+16 AACGGCAT
+17 GAAACGGC
+18 CGTTTCAA
+19 TATTGAAA
+20 TCAATACG
+21 AATACGTG
+22 TACGTGAA
+23 GTTTCACG
+24 TGAAACTA
+25 AAACTATT
+26 GTAATAGT
+27 TATTACGT
+28 TTACGTCA
+29 CATGACGT
+30 GTCATGAC
+31 GCGTCATG
+32 AAGCGTCA
+33 TCGCTTAA
+34 GCTTAAGC
+35 TCGCTTAA
+36 AAGCGTGT
+37 CCACACGC
diff --git a/genomix/genomix-hadoop/src/test/resources/data/sequence/walk_random_seq2.txt b/genomix/genomix-hadoop/src/test/resources/data/sequence/walk_random_seq2.txt
new file mode 100644
index 0000000..d65f7c0
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/resources/data/sequence/walk_random_seq2.txt
@@ -0,0 +1,6 @@
+1 AATGCGCT
+2 CTAGCGCA
+3 CGCTAGGA
+4 CTAGGAGT
+5 AGGAGTTG
+6 AGCGCATT