blob: 2053bf93325c1c4ba5dfdf08a414492074682e36 [file] [log] [blame]
Jake Biesingerac4a3fb2013-07-11 18:29:09 -07001#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3"""
4Generate a random smattering of reads
5"""
6
7import sys
8import argparse
9import random
10import itertools
11import string
12
13
14def get_parser():
15 parser = argparse.ArgumentParser()
16 parser.add_argument('--walk', '-w', action='store_true')
17 parser.add_argument('--coverage', '-c', type=float, required=True)
18 parser.add_argument('--genome-length', '-g', type=int,
19 required=True)
20 parser.add_argument('--read-length', '-l', type=int, required=True)
21 parser.add_argument('--no-rc', action='store_true')
22 parser.add_argument('--error-rate', type=float, default=.01)
23 parser.add_argument('--outreads', '-r', type=argparse.FileType('w'),
24 default='reads.txt')
25 parser.add_argument('--outgenome', '-o', type=argparse.FileType('w'),
26 default='genome.txt')
27 return parser
28
29
30def reverse_complement(kmer, _table=string.maketrans('ACGT', 'TGCA')):
31 return string.translate(kmer, _table)[::-1]
32
33
34def make_genome(length):
35 return ''.join(random.choice('ACGT') for i in xrange(length))
36
37
38def make_reads(genome, read_length, coverage, walk=False, no_rc=False,
39 error_rate=0.):
40 num_reads = int(coverage * len(genome)) / read_length
41 if walk:
42 step_size = max(1, int(len(genome) / num_reads))
43 next_starts = itertools.cycle(xrange(0, len(genome) - read_length + 1,
44 step_size))
45 else:
46 next_starts = (random.randrange(len(genome) - read_length) for i in itertools.cycle([None]))
47 num_errors = 0
48 for i in range(1, num_reads + 1):
49 start = next_starts.next()
50 seq = genome[start:start + read_length]
51 if not no_rc and random.choice([True, False]):
52 seq = reverse_complement(seq)
53 final_seq = []
54 for l in seq:
55 if random.random() < error_rate:
56 num_errors += 1
57 final_seq.append(random.choice(list(set('ATGC') - set(l))))
58 else:
59 final_seq.append(l)
60
61 yield '%s\t%s\n' % (i, ''.join(final_seq))
62 print >> sys.stderr, 'introduced', num_errors, 'errors'
63
64
65def main(args):
66 parser = get_parser()
67 args = parser.parse_args(args)
68 genome = make_genome(args.genome_length)
69 args.outgenome.write(genome)
70 args.outgenome.write('\n')
71 args.outreads.writelines(make_reads(genome, args.read_length,
72 args.coverage, args.walk, args.no_rc,
73 args.error_rate))
74
75
76if __name__ == '__main__':
77 main(sys.argv[1:])