hadoop pathmerge h1 algorithm
git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@3377 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java
index 133e7f8..21fee2b 100644
--- a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/Kmer.java
@@ -15,7 +15,7 @@
package edu.uci.ics.genomix.type;
public class Kmer {
-
+
public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
public final static class GENE_CODE {
diff --git a/genomix/genomix-hadoop/expected/result2 b/genomix/genomix-hadoop/expected/result2
index 3665e18..db55a38 100755
--- a/genomix/genomix-hadoop/expected/result2
+++ b/genomix/genomix-hadoop/expected/result2
@@ -1,3 +1,8 @@
-39 41 0c 1 1
-e4 04 31 24 1
-93 13 c4 16 1
+GCA -72
+AGC 1
+CGC -119
+TGC 1
+ATC 36
+TCG 18
+CAT 66
+GCT 32
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
index 1a33c7a..5d39928 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
@@ -42,8 +42,9 @@
byte bytCount = 0;
while (values.hasNext()) {
//Merge By the all adjacent Nodes;
- groupByAdjList = (byte) (groupByAdjList | values.next().getAdjBitMap());
- count ++;
+ KmerCountValue geneValue = values.next();
+ groupByAdjList = (byte) (groupByAdjList | geneValue.getAdjBitMap());
+ count = count + (int) geneValue.getCount();
}
if (count >= 127)
bytCount = (byte) 127;
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java
index 6e55f09..b90ab23 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java
@@ -28,8 +28,6 @@
import org.apache.hadoop.mapred.TextInputFormat;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
-
-import edu.uci.ics.genomix.type.KmerBytesWritable;
import edu.uci.ics.genomix.type.KmerCountValue;
/**
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
index 443cceb..cd8b7e3 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
@@ -55,20 +55,20 @@
/*succeed node
A 00000001 1
- G 00000010 2
- C 00000100 4
+ C 00000010 2
+ G 00000100 4
T 00001000 8
precursor node
A 00010000 16
- G 00100000 32
- C 01000000 64
+ C 00100000 32
+ G 01000000 64
T 10000000 128*/
@Override
public void map(LongWritable key, Text value, OutputCollector<BytesWritable, KmerCountValue> output,
Reporter reporter) throws IOException {
/* A 00
- G 01
- C 10
+ C 01
+ G 10
T 11*/
String geneLine = value.toString(); // Read the Real Gene Line
Pattern genePattern = Pattern.compile("[AGCT]+");
@@ -76,7 +76,7 @@
boolean isValid = geneMatcher.matches();
if (isValid == true) {
/** first kmer */
- byte count = 0;
+ byte count = 1;
byte[] array = geneLine.getBytes();
byte[] kmer = Kmer.compressKmer(KMER_SIZE, array, 0);
byte pre = 0;
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
index 03bad56..676d6f1 100755
--- a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
@@ -41,8 +41,7 @@
byte bytCount = 0;
while (values.hasNext()) {
//Merge By the all adjacent Nodes;
- KmerCountValue geneValue = values.next();
-
+ KmerCountValue geneValue = values.next();
groupByAdjList = (byte) (groupByAdjList | geneValue.getAdjBitMap());
count = count + (int) geneValue.getCount();
}
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java
index 28cbbbc..6bd3bd5 100755
--- a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java
@@ -35,23 +35,24 @@
import edu.uci.ics.genomix.type.Kmer;
import edu.uci.ics.genomix.type.KmerCountValue;
+import edu.uci.ics.utils.TestUtils;
/**
* This class test the correctness of graphbuilding program
*/
@SuppressWarnings("deprecation")
public class GraphBuildingTest {
- private static final String ACTUAL_RESULT_DIR = "actual";
- @SuppressWarnings("deprecation")
+ private static final String ACTUAL_RESULT_DIR = "actual1";
+ private static final String COMPARE_DIR = "compare";
private JobConf conf = new JobConf();
private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
- private static final String DATA_PATH = "data/webmap/text.txt";
+ private static final String DATA_PATH = "data/webmap/Test.txt";
private static final String HDFS_PATH = "/webmap";
- private static final String RESULT_PATH = "/result2";
- private static final String EXPECTED_PATH = "expected/result2";
- private static final String TEST_SOURCE_DIR = "testactual/source.txt";
+ private static final String RESULT_PATH = "/result1";
+ private static final String EXPECTED_PATH = "expected/result1";
+ private static final String TEST_SOURCE_DIR = COMPARE_DIR + RESULT_PATH + "/comparesource.txt";
private static final int COUNT_REDUCER = 4;
- private static final int SIZE_KMER = 12;
+ private static final int SIZE_KMER = 3;
private MiniDFSCluster dfsCluster;
private MiniMRCluster mrCluster;
@@ -70,11 +71,12 @@
SequenceFile.Reader reader = null;
Path path = new Path(RESULT_PATH + "/part-00000");
- reader = new SequenceFile.Reader(dfs, path, conf);
+ reader = new SequenceFile.Reader(dfs, path, conf);
BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
File filePathTo = new File(TEST_SOURCE_DIR);
BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+
while (reader.next(key, value)) {
bw.write(Kmer.recoverKmerFrom(SIZE_KMER, key.getBytes(), 0, key.getLength()) + "\t" + value.toString());
bw.newLine();
@@ -82,7 +84,7 @@
bw.close();
dumpResult();
-// TestUtils.compareWithResult(new File(TEST_SOURCE_DIR), new File(EXPECTED_PATH));
+ TestUtils.compareWithResult(new File(TEST_SOURCE_DIR), new File(EXPECTED_PATH));
cleanupHadoop();
@@ -114,7 +116,7 @@
private void dumpResult() throws IOException {
Path src = new Path(RESULT_PATH);
- Path dest = new Path(ACTUAL_RESULT_DIR + "/");
+ Path dest = new Path(ACTUAL_RESULT_DIR);
dfs.copyToLocalFile(src, dest);
}
}
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/utils/TestUtils.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/utils/TestUtils.java
index 237a764..015017a 100755
--- a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/utils/TestUtils.java
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/utils/TestUtils.java
@@ -71,7 +71,5 @@
public static void main(String[] args) throws Exception {
TestUtils TUtils = new TestUtils();
- TUtils.compareWithResult(new File("/Users/hadoop/Documents/workspace/Test/part-00000"), new File(
- "/Users/hadoop/Documents/workspace/Test/test.txt"));
}
}