split genomix-core into two modules: genomix-core and genomix-hyracks
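
The Hyracks integration test and its resources (JobRunTest plus the hadoop/conf, data, and expected files below) now live under genomix/genomix-hyracks. A minimal sketch of how the genomix aggregator pom would list the two modules after the split, assuming a standard Maven multi-module layout (the pom path and parent coordinates are assumptions, not part of this change):

    <!-- hypothetical genomix/pom.xml after the split; only the module list is shown -->
    <modules>
        <module>genomix-core</module>
        <module>genomix-hyracks</module>
    </modules>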

git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_genomix@3029 123451ca-8445-de46-9d55-352943316053
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
new file mode 100644
index 0000000..3a37087
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
@@ -0,0 +1,189 @@
+package edu.uci.ics.genomix.example.jobrun;
+
+import java.io.BufferedWriter;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import junit.framework.Assert;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.driver.Driver;
+import edu.uci.ics.genomix.driver.Driver.Plan;
+import edu.uci.ics.genomix.job.GenomixJob;
+import edu.uci.ics.genomix.type.Kmer;
+import edu.uci.ics.genomix.type.KmerCountValue;
+import edu.uci.ics.hyracks.hdfs.utils.HyracksUtils;
+import edu.uci.ics.hyracks.hdfs.utils.TestUtils;
+
+public class JobRunTest {
+	private static final String ACTUAL_RESULT_DIR = "actual";
+	private static final String PATH_TO_HADOOP_CONF = "src/test/resources/hadoop/conf";
+
+	private static final String DATA_PATH = "src/test/resources/data/webmap/text.txt";
+	private static final String HDFS_INPUT_PATH = "/webmap";
+	private static final String HDFS_OUTPUT_PATH = "/webmap_result";
+	private static final String HDFS_OUTPUT_FILE = HDFS_OUTPUT_PATH + "/part-0";
+
+	private static final String DUMPED_RESULT = ACTUAL_RESULT_DIR
+			+ HDFS_OUTPUT_PATH + "/merged.txt";
+	private static final String CONVERT_RESULT = DUMPED_RESULT + ".txt";
+	private static final String EXPECTED_PATH = "src/test/resources/expected/result2";
+
+	private static final String HYRACKS_APP_NAME = "genomix";
+	private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR
+			+ File.separator + "conf.xml";
+	private MiniDFSCluster dfsCluster;
+
+	private JobConf conf = new JobConf();
+	private int numberOfNC = 2;
+	private int numPartitionPerMachine = 2;
+
+	private Driver driver;
+
+	@Before
+	public void setUp() throws Exception {
+		cleanupStores();
+		HyracksUtils.init();
+		HyracksUtils.createApp(HYRACKS_APP_NAME);
+		FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+		FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+		startHDFS();
+
+		FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
+		FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
+
+		conf.setInt(GenomixJob.KMER_LENGTH, 5);
+		driver = new Driver(HyracksUtils.CC_HOST,
+				HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT,
+				numPartitionPerMachine);
+	}
+
+	private void cleanupStores() throws IOException {
+		FileUtils.forceMkdir(new File("teststore"));
+		FileUtils.forceMkdir(new File("build"));
+		FileUtils.cleanDirectory(new File("teststore"));
+		FileUtils.cleanDirectory(new File("build"));
+	}
+
+	private void startHDFS() throws IOException {
+		conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
+		conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
+		conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
+
+		FileSystem lfs = FileSystem.getLocal(new Configuration());
+		lfs.delete(new Path("build"), true);
+		System.setProperty("hadoop.log.dir", "logs");
+		dfsCluster = new MiniDFSCluster(conf, numberOfNC, true, null);
+		FileSystem dfs = FileSystem.get(conf);
+		Path src = new Path(DATA_PATH);
+		Path dest = new Path(HDFS_INPUT_PATH);
+		Path result = new Path(HDFS_OUTPUT_PATH);
+		dfs.mkdirs(dest);
+		dfs.mkdirs(result);
+		dfs.copyFromLocalFile(src, dest);
+
+		DataOutputStream confOutput = new DataOutputStream(
+				new FileOutputStream(new File(HADOOP_CONF_PATH)));
+		conf.writeXml(confOutput);
+		confOutput.flush();
+		confOutput.close();
+	}
+
+	private void cleanUpReEntry() throws IOException {
+		FileSystem lfs = FileSystem.getLocal(new Configuration());
+		if (lfs.exists(new Path(DUMPED_RESULT))) {
+			lfs.delete(new Path(DUMPED_RESULT), true);
+		}
+		FileSystem dfs = FileSystem.get(conf);
+		if (dfs.exists(new Path(HDFS_OUTPUT_PATH))) {
+			dfs.delete(new Path(HDFS_OUTPUT_PATH), true);
+		}
+	}
+
+	@Test
+	public void TestExternalGroupby() throws Exception {
+		conf.set(GenomixJob.GROUPBY_TYPE, "external");
+		conf.set(GenomixJob.OUTPUT_FORMAT, "text");
+		System.err.println("Testing ExternalGroupBy");
+		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+		Assert.assertEquals(true, checkResults());
+	}
+
+	//@Test
+	public void TestPreClusterGroupby() throws Exception {
+		conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+		conf.set(GenomixJob.OUTPUT_FORMAT, "text");
+		System.err.println("Testing PreClusterGroupBy");
+		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+		Assert.assertEquals(true, checkResults());
+	}
+
+	@Test
+	public void TestHybridGroupby() throws Exception {
+		conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
+		conf.set(GenomixJob.OUTPUT_FORMAT, "text");
+		System.err.println("Testing HybridGroupBy");
+		driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+		Assert.assertEquals(true, checkResults());
+	}
+
+	private boolean checkResults() throws Exception {
+		FileUtil.copyMerge(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH),
+				FileSystem.getLocal(new Configuration()), new Path(
+						DUMPED_RESULT), false, conf, null);
+		File dumped = new File(DUMPED_RESULT);
+		String format = conf.get(GenomixJob.OUTPUT_FORMAT);
+		if (!"text".equalsIgnoreCase(format)) {
+	        SequenceFile.Reader reader = null;
+	        Path path = new Path(HDFS_OUTPUT_FILE);
+	        FileSystem dfs = FileSystem.get(conf);
+	        reader = new SequenceFile.Reader(dfs, path, conf);
+	        BytesWritable key = (BytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+	        KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+	        File filePathTo = new File(CONVERT_RESULT);
+	        BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+	        int k = conf.getInt(GenomixJob.KMER_LENGTH, 25);
+	        while (reader.next(key, value)) {
+	            bw.write(Kmer.recoverKmerFrom(k, key.getBytes(), 0, key.getLength()) + "\t" + value.toString());
+	            bw.newLine();
+	        }
+	        bw.close();
+	        reader.close();
+	        dumped = new File(CONVERT_RESULT);
+		}
+        
+		TestUtils.compareWithSortedResult(new File(EXPECTED_PATH), dumped);
+		return true;
+	}
+
+	@After
+	public void tearDown() throws Exception {
+		HyracksUtils.destroyApp(HYRACKS_APP_NAME);
+		HyracksUtils.deinit();
+		cleanupHDFS();
+	}
+
+	private void cleanupHDFS() throws Exception {
+		dfsCluster.shutdown();
+	}
+
+}
diff --git a/genomix/genomix-hyracks/src/test/resources/data/0/text.txt b/genomix/genomix-hyracks/src/test/resources/data/0/text.txt
new file mode 100755
index 0000000..f63a141
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/data/0/text.txt
@@ -0,0 +1,4 @@
+@625E1AAXX100810:1:100:10000:10271/1
+AATAGAAG
++
+EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
diff --git a/genomix/genomix-hyracks/src/test/resources/data/webmap/text.txt b/genomix/genomix-hyracks/src/test/resources/data/webmap/text.txt
new file mode 100755
index 0000000..f63a141
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/data/webmap/text.txt
@@ -0,0 +1,4 @@
+@625E1AAXX100810:1:100:10000:10271/1
+AATAGAAG
++
+EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
diff --git a/genomix/genomix-hyracks/src/test/resources/expected/result2 b/genomix/genomix-hyracks/src/test/resources/expected/result2
new file mode 100755
index 0000000..5e76458
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/expected/result2
@@ -0,0 +1,4 @@
+AATAG	|A	1
+AGAAG	T|	1
+ATAGA	A|A	1
+TAGAA	A|G	1
diff --git a/genomix/genomix-hyracks/src/test/resources/hadoop/conf/core-site.xml b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/core-site.xml
new file mode 100644
index 0000000..3e5bacb
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/core-site.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+	<property>
+		<name>fs.default.name</name>
+		<value>hdfs://127.0.0.1:31888</value>
+	</property>
+	<property>
+		<name>hadoop.tmp.dir</name>
+		<value>/tmp/hadoop</value>
+	</property>
+
+
+</configuration>
diff --git a/genomix/genomix-hyracks/src/test/resources/hadoop/conf/hdfs-site.xml b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/hdfs-site.xml
new file mode 100644
index 0000000..b1b1902
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/hdfs-site.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+	<property>
+		<name>dfs.replication</name>
+		<value>1</value>
+	</property>
+
+	<property>
+		<name>dfs.block.size</name>
+		<value>65536</value>
+	</property>
+
+</configuration>
diff --git a/genomix/genomix-hyracks/src/test/resources/hadoop/conf/log4j.properties b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/log4j.properties
new file mode 100755
index 0000000..d5e6004
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/log4j.properties
@@ -0,0 +1,94 @@
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=FATAL,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+# Define the root logger to the system property "hadoop.root.logger".
+log4j.rootLogger=${hadoop.root.logger}, EventCounter
+
+# Logging Threshold
+log4j.threshold=FATAL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootLogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+
+#Default values
+hadoop.tasklog.taskid=null
+hadoop.tasklog.noKeepSplits=4
+hadoop.tasklog.totalLogFileSize=100
+hadoop.tasklog.purgeLogSplits=true
+hadoop.tasklog.logsRetainHours=12
+
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
+log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# Rolling File Appender
+#
+
+#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
+#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Logfile size and 30-day backups
+#log4j.appender.RFA.MaxFileSize=1MB
+#log4j.appender.RFA.MaxBackupIndex=30
+
+#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+#
+# FSNamesystem Audit logging
+# All audit events are logged at INFO level
+#
+log4j.logger.org.apache.hadoop.fs.FSNamesystem.audit=WARN
+
+# Custom Logging levels
+
+#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
+#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
+#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG
+
+# Jets3t library
+log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
diff --git a/genomix/genomix-hyracks/src/test/resources/hadoop/conf/mapred-site.xml b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/mapred-site.xml
new file mode 100644
index 0000000..525e7d5
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/mapred-site.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+	<property>
+		<name>mapred.job.tracker</name>
+		<value>localhost:29007</value>
+	</property>
+	<property>
+		<name>mapred.tasktracker.map.tasks.maximum</name>
+		<value>20</value>
+	</property>
+	<property>
+		<name>mapred.tasktracker.reduce.tasks.maximum</name>
+		<value>20</value>
+	</property>
+	<property>
+		<name>mapred.max.split.size</name>
+		<value>2048</value>
+	</property>
+
+</configuration>