Merged fullstack_asterix_stabilization -r 2933:3157
git-svn-id: https://hyracks.googlecode.com/svn/branches/fullstack_hyracks_ioc@3164 123451ca-8445-de46-9d55-352943316053
diff --git a/hivesterix/hivesterix-dist/src/main/assembly/binary-assembly.xml b/hivesterix/hivesterix-dist/src/main/assembly/binary-assembly.xml
new file mode 100755
index 0000000..de3757f
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/assembly/binary-assembly.xml
@@ -0,0 +1,26 @@
+<assembly>
+ <id>binary-assembly</id>
+ <formats>
+ <format>zip</format>
+ <format>dir</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <fileSets>
+ <fileSet>
+ <directory>target/appassembler/bin</directory>
+ <outputDirectory>bin</outputDirectory>
+ <fileMode>0755</fileMode>
+ </fileSet>
+ <fileSet>
+ <directory>target/appassembler/lib</directory>
+ <outputDirectory>lib</outputDirectory>
+ </fileSet>
+ <fileSet>
+ <directory>target</directory>
+ <outputDirectory>lib</outputDirectory>
+ <includes>
+ <include>*.jar</include>
+ </includes>
+ </fileSet>
+ </fileSets>
+</assembly>
diff --git a/hivesterix/hivesterix-dist/src/main/java/edu/uci/ics/hivesterix/runtime/exec/HyracksExecutionEngine.java b/hivesterix/hivesterix-dist/src/main/java/edu/uci/ics/hivesterix/runtime/exec/HyracksExecutionEngine.java
new file mode 100644
index 0000000..e075f09
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/edu/uci/ics/hivesterix/runtime/exec/HyracksExecutionEngine.java
@@ -0,0 +1,595 @@
+package edu.uci.ics.hivesterix.runtime.exec;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.io.Serializable;
+import java.net.InetAddress;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.ConditionalTask;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
+import org.apache.hadoop.hive.ql.plan.FetchWork;
+import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
+import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
+import org.apache.hadoop.hive.ql.plan.MapredWork;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+
+import edu.uci.ics.hivesterix.common.config.ConfUtil;
+import edu.uci.ics.hivesterix.logical.expression.HiveExpressionTypeComputer;
+import edu.uci.ics.hivesterix.logical.expression.HiveMergeAggregationExpressionFactory;
+import edu.uci.ics.hivesterix.logical.expression.HiveNullableTypeComputer;
+import edu.uci.ics.hivesterix.logical.expression.HivePartialAggregationTypeComputer;
+import edu.uci.ics.hivesterix.logical.plan.HiveAlgebricksTranslator;
+import edu.uci.ics.hivesterix.logical.plan.HiveLogicalPlanAndMetaData;
+import edu.uci.ics.hivesterix.optimizer.rulecollections.HiveRuleCollections;
+import edu.uci.ics.hivesterix.runtime.factory.evaluator.HiveExpressionRuntimeProvider;
+import edu.uci.ics.hivesterix.runtime.factory.nullwriter.HiveNullWriterFactory;
+import edu.uci.ics.hivesterix.runtime.inspector.HiveBinaryBooleanInspectorFactory;
+import edu.uci.ics.hivesterix.runtime.inspector.HiveBinaryIntegerInspectorFactory;
+import edu.uci.ics.hivesterix.runtime.jobgen.HiveConnectorPolicyAssignmentPolicy;
+import edu.uci.ics.hivesterix.runtime.jobgen.HiveConnectorPolicyAssignmentPolicy.Policy;
+import edu.uci.ics.hivesterix.runtime.provider.HiveBinaryComparatorFactoryProvider;
+import edu.uci.ics.hivesterix.runtime.provider.HiveBinaryHashFunctionFactoryProvider;
+import edu.uci.ics.hivesterix.runtime.provider.HiveBinaryHashFunctionFamilyProvider;
+import edu.uci.ics.hivesterix.runtime.provider.HiveNormalizedKeyComputerFactoryProvider;
+import edu.uci.ics.hivesterix.runtime.provider.HivePrinterFactoryProvider;
+import edu.uci.ics.hivesterix.runtime.provider.HiveSerializerDeserializerProvider;
+import edu.uci.ics.hivesterix.runtime.provider.HiveTypeTraitProvider;
+import edu.uci.ics.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint;
+import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException;
+import edu.uci.ics.hyracks.algebricks.common.utils.Pair;
+import edu.uci.ics.hyracks.algebricks.compiler.api.HeuristicCompilerFactoryBuilder;
+import edu.uci.ics.hyracks.algebricks.compiler.api.HeuristicCompilerFactoryBuilder.DefaultOptimizationContextFactory;
+import edu.uci.ics.hyracks.algebricks.compiler.api.ICompiler;
+import edu.uci.ics.hyracks.algebricks.compiler.api.ICompilerFactory;
+import edu.uci.ics.hyracks.algebricks.compiler.rewriter.rulecontrollers.SequentialFixpointRuleController;
+import edu.uci.ics.hyracks.algebricks.compiler.rewriter.rulecontrollers.SequentialOnceRuleController;
+import edu.uci.ics.hyracks.algebricks.core.algebra.base.ILogicalPlan;
+import edu.uci.ics.hyracks.algebricks.core.algebra.base.ILogicalPlanAndMetadata;
+import edu.uci.ics.hyracks.algebricks.core.algebra.prettyprint.LogicalOperatorPrettyPrintVisitor;
+import edu.uci.ics.hyracks.algebricks.core.algebra.prettyprint.PlanPrettyPrinter;
+import edu.uci.ics.hyracks.algebricks.core.rewriter.base.AbstractRuleController;
+import edu.uci.ics.hyracks.algebricks.core.rewriter.base.IAlgebraicRewriteRule;
+import edu.uci.ics.hyracks.algebricks.core.rewriter.base.PhysicalOptimizationConfig;
+import edu.uci.ics.hyracks.api.client.HyracksConnection;
+import edu.uci.ics.hyracks.api.client.IHyracksClientConnection;
+import edu.uci.ics.hyracks.api.job.JobId;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+
+@SuppressWarnings({ "rawtypes", "unchecked" })
+public class HyracksExecutionEngine implements IExecutionEngine {
+
+ private static final Log LOG = LogFactory.getLog(HyracksExecutionEngine.class.getName());
+ private static final String clusterPropertiesPath = "conf/cluster.properties";
+ private static final String masterFilePath = "conf/master";
+
+ private static List<Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>> DEFAULT_LOGICAL_REWRITES = new ArrayList<Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>>();
+ private static List<Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>> DEFAULT_PHYSICAL_REWRITES = new ArrayList<Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>>();
+ static {
+ SequentialFixpointRuleController seqCtrlNoDfs = new SequentialFixpointRuleController(false);
+ SequentialFixpointRuleController seqCtrlFullDfs = new SequentialFixpointRuleController(true);
+ SequentialOnceRuleController seqOnceCtrl = new SequentialOnceRuleController(true);
+ DEFAULT_LOGICAL_REWRITES.add(new Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>(seqCtrlFullDfs,
+ HiveRuleCollections.NORMALIZATION));
+ DEFAULT_LOGICAL_REWRITES.add(new Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>(seqCtrlNoDfs,
+ HiveRuleCollections.COND_PUSHDOWN_AND_JOIN_INFERENCE));
+ DEFAULT_LOGICAL_REWRITES.add(new Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>(seqCtrlFullDfs,
+ HiveRuleCollections.LOAD_FIELDS));
+ DEFAULT_LOGICAL_REWRITES.add(new Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>(seqCtrlNoDfs,
+ HiveRuleCollections.OP_PUSHDOWN));
+ DEFAULT_LOGICAL_REWRITES.add(new Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>(seqOnceCtrl,
+ HiveRuleCollections.DATA_EXCHANGE));
+ DEFAULT_LOGICAL_REWRITES.add(new Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>(seqCtrlNoDfs,
+ HiveRuleCollections.CONSOLIDATION));
+
+ DEFAULT_PHYSICAL_REWRITES.add(new Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>(seqOnceCtrl,
+ HiveRuleCollections.PHYSICAL_PLAN_REWRITES));
+ DEFAULT_PHYSICAL_REWRITES.add(new Pair<AbstractRuleController, List<IAlgebraicRewriteRule>>(seqOnceCtrl,
+ HiveRuleCollections.prepareJobGenRules));
+ }
+
+ /**
+ * static configurations for compiler
+ */
+ private HeuristicCompilerFactoryBuilder builder;
+
+ /**
+ * compiler
+ */
+ private ICompiler compiler;
+
+ /**
+ * physical optimization config
+ */
+ private PhysicalOptimizationConfig physicalOptimizationConfig;
+
+ /**
+ * final ending operators
+ */
+ private List<Operator> leaveOps = new ArrayList<Operator>();
+
+ /**
+ * tasks that are already visited
+ */
+ private Map<Task<? extends Serializable>, Boolean> tasksVisited = new HashMap<Task<? extends Serializable>, Boolean>();
+
+ /**
+ * hyracks job spec
+ */
+ private JobSpecification jobSpec;
+
+ /**
+ * hive configuration
+ */
+ private HiveConf conf;
+
+ /**
+ * plan printer
+ */
+ private PrintWriter planPrinter;
+
+ /**
+ * properties
+ */
+ private Properties clusterProps;
+
+ /**
+ * the Hyracks client connection
+ */
+ private IHyracksClientConnection hcc;
+
+ public HyracksExecutionEngine(HiveConf conf) {
+ this.conf = conf;
+ init(conf);
+ }
+
+ public HyracksExecutionEngine(HiveConf conf, PrintWriter planPrinter) {
+ this.conf = conf;
+ this.planPrinter = planPrinter;
+ init(conf);
+ }
+
+ private void init(HiveConf conf) {
+ builder = new HeuristicCompilerFactoryBuilder(DefaultOptimizationContextFactory.INSTANCE);
+ builder.setLogicalRewrites(DEFAULT_LOGICAL_REWRITES);
+ builder.setPhysicalRewrites(DEFAULT_PHYSICAL_REWRITES);
+ builder.setIMergeAggregationExpressionFactory(HiveMergeAggregationExpressionFactory.INSTANCE);
+ builder.setExpressionTypeComputer(HiveExpressionTypeComputer.INSTANCE);
+ builder.setNullableTypeComputer(HiveNullableTypeComputer.INSTANCE);
+
+ long memSizeExternalGby = conf.getLong("hive.algebricks.groupby.external.memory", 268435456);
+ long memSizeExternalSort = conf.getLong("hive.algebricks.sort.memory", 536870912);
+ int frameSize = conf.getInt("hive.algebricks.framesize", 32768);
+
+ physicalOptimizationConfig = new PhysicalOptimizationConfig();
+ int frameLimitExtGby = (int) (memSizeExternalGby / frameSize);
+ physicalOptimizationConfig.setMaxFramesExternalGroupBy(frameLimitExtGby);
+ int frameLimitExtSort = (int) (memSizeExternalSort / frameSize);
+ physicalOptimizationConfig.setMaxFramesExternalSort(frameLimitExtSort);
+ builder.setPhysicalOptimizationConfig(physicalOptimizationConfig);
+ }
+
+ @Override
+ public int compileJob(List<Task<? extends Serializable>> rootTasks) {
+ // clean up
+ leaveOps.clear();
+ tasksVisited.clear();
+ jobSpec = null;
+
+ HashMap<String, PartitionDesc> aliasToPath = new HashMap<String, PartitionDesc>();
+ List<Operator> rootOps = generateRootOperatorDAG(rootTasks, aliasToPath);
+
+ // get all leave Ops
+ getLeaves(rootOps, leaveOps);
+
+ HiveAlgebricksTranslator translator = new HiveAlgebricksTranslator();
+ try {
+ translator.translate(rootOps, null, aliasToPath);
+
+ ILogicalPlan plan = translator.genLogicalPlan();
+
+ if (plan.getRoots() != null && plan.getRoots().size() > 0 && plan.getRoots().get(0).getValue() != null) {
+ translator.printOperators();
+ ILogicalPlanAndMetadata planAndMetadata = new HiveLogicalPlanAndMetaData(plan,
+ translator.getMetadataProvider());
+
+ ICompilerFactory compilerFactory = builder.create();
+ compiler = compilerFactory.createCompiler(planAndMetadata.getPlan(),
+ planAndMetadata.getMetadataProvider(), translator.getVariableCounter());
+
+ // run optimization and re-writing rules for Hive plan
+ compiler.optimize();
+
+ // print optimized plan
+ LogicalOperatorPrettyPrintVisitor pvisitor = new LogicalOperatorPrettyPrintVisitor();
+ StringBuilder buffer = new StringBuilder();
+ PlanPrettyPrinter.printPlan(plan, buffer, pvisitor, 0);
+ String planStr = buffer.toString();
+ System.out.println(planStr);
+
+ if (planPrinter != null)
+ planPrinter.print(planStr);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ return 1;
+ }
+
+ return 0;
+ }
+
+ private void codeGen() throws AlgebricksException {
+ try {
+ // number of cpu cores in the cluster
+ builder.setClusterLocations(new AlgebricksAbsolutePartitionConstraint(ConfUtil.getNCs()));
+ } catch (Exception e) {
+ throw new AlgebricksException(e);
+ }
+ // builder.setClusterTopology(ConfUtil.getClusterTopology());
+ builder.setBinaryBooleanInspectorFactory(HiveBinaryBooleanInspectorFactory.INSTANCE);
+ builder.setBinaryIntegerInspectorFactory(HiveBinaryIntegerInspectorFactory.INSTANCE);
+ builder.setComparatorFactoryProvider(HiveBinaryComparatorFactoryProvider.INSTANCE);
+ builder.setExpressionRuntimeProvider(HiveExpressionRuntimeProvider.INSTANCE);
+ builder.setHashFunctionFactoryProvider(HiveBinaryHashFunctionFactoryProvider.INSTANCE);
+ builder.setPrinterProvider(HivePrinterFactoryProvider.INSTANCE);
+ builder.setSerializerDeserializerProvider(HiveSerializerDeserializerProvider.INSTANCE);
+ builder.setNullWriterFactory(HiveNullWriterFactory.INSTANCE);
+ builder.setNormalizedKeyComputerFactoryProvider(HiveNormalizedKeyComputerFactoryProvider.INSTANCE);
+ builder.setPartialAggregationTypeComputer(HivePartialAggregationTypeComputer.INSTANCE);
+ builder.setTypeTraitProvider(HiveTypeTraitProvider.INSTANCE);
+ builder.setHashFunctionFamilyProvider(HiveBinaryHashFunctionFamilyProvider.INSTANCE);
+
+ jobSpec = compiler.createJob(null);
+
+ // set the policy
+ String policyStr = conf.get("hive.hyracks.connectorpolicy");
+ if (policyStr == null)
+ policyStr = "PIPELINING";
+ Policy policyValue = Policy.valueOf(policyStr);
+ jobSpec.setConnectorPolicyAssignmentPolicy(new HiveConnectorPolicyAssignmentPolicy(policyValue));
+ jobSpec.setUseConnectorPolicyForScheduling(false);
+ }
+
+ @Override
+ public int executeJob() {
+ try {
+ codeGen();
+ executeHyracksJob(jobSpec);
+ } catch (Exception e) {
+ e.printStackTrace();
+ return 1;
+ }
+ return 0;
+ }
+
+ private List<Operator> generateRootOperatorDAG(List<Task<? extends Serializable>> rootTasks,
+ HashMap<String, PartitionDesc> aliasToPath) {
+
+ List<Operator> rootOps = new ArrayList<Operator>();
+ List<Task<? extends Serializable>> toDelete = new ArrayList<Task<? extends Serializable>>();
+ tasksVisited.clear();
+
+ for (int i = rootTasks.size() - 1; i >= 0; i--) {
+ /**
+ * list of map-reduce tasks
+ */
+ Task<? extends Serializable> task = rootTasks.get(i);
+
+ if (task instanceof MapRedTask) {
+ List<Operator> mapRootOps = articulateMapReduceOperators(task, rootOps, aliasToPath, rootTasks);
+ if (i == 0)
+ rootOps.addAll(mapRootOps);
+ else {
+ List<Operator> leaves = new ArrayList<Operator>();
+ getLeaves(rootOps, leaves);
+
+ List<Operator> mapChildren = new ArrayList<Operator>();
+ for (Operator childMap : mapRootOps) {
+ if (childMap instanceof TableScanOperator) {
+ TableScanDesc topDesc = (TableScanDesc) childMap.getConf();
+ if (topDesc == null)
+ mapChildren.add(childMap);
+ else {
+ rootOps.add(childMap);
+ }
+ } else
+ mapChildren.add(childMap);
+ }
+
+ if (mapChildren.size() > 0) {
+ for (Operator leaf : leaves)
+ leaf.setChildOperators(mapChildren);
+ for (Operator child : mapChildren)
+ child.setParentOperators(leaves);
+ }
+ }
+
+ MapredWork mr = (MapredWork) task.getWork();
+ HashMap<String, PartitionDesc> map = mr.getAliasToPartnInfo();
+
+ addAliasToPartition(aliasToPath, map);
+ toDelete.add(task);
+ }
+ }
+
+ for (Task<? extends Serializable> task : toDelete)
+ rootTasks.remove(task);
+
+ return rootOps;
+ }
+
+ private void addAliasToPartition(HashMap<String, PartitionDesc> aliasToPath, HashMap<String, PartitionDesc> map) {
+ Iterator<String> keys = map.keySet().iterator();
+ while (keys.hasNext()) {
+ String key = keys.next();
+ PartitionDesc part = map.get(key);
+ String[] names = key.split(":");
+ for (String name : names) {
+ aliasToPath.put(name, part);
+ }
+ }
+ }
+
+ private List<Operator> articulateMapReduceOperators(Task task, List<Operator> rootOps,
+ HashMap<String, PartitionDesc> aliasToPath, List<Task<? extends Serializable>> rootTasks) {
+ // System.out.println("!"+task.getName());
+ if (!(task instanceof MapRedTask)) {
+ if (!(task instanceof ConditionalTask)) {
+ rootTasks.add(task);
+ return null;
+ } else {
+ // remove map-reduce branches in condition task
+ ConditionalTask condition = (ConditionalTask) task;
+ List<Task<? extends Serializable>> branches = condition.getListTasks();
+ for (int i = branches.size() - 1; i >= 0; i--) {
+ Task branch = branches.get(i);
+ if (branch instanceof MapRedTask) {
+ return articulateMapReduceOperators(branch, rootOps, aliasToPath, rootTasks);
+ }
+ }
+ rootTasks.add(task);
+ return null;
+ }
+ }
+
+ MapredWork mr = (MapredWork) task.getWork();
+ HashMap<String, PartitionDesc> map = mr.getAliasToPartnInfo();
+
+ // put all aliasToParitionDesc mapping into the map
+ addAliasToPartition(aliasToPath, map);
+
+ MapRedTask mrtask = (MapRedTask) task;
+ MapredWork work = (MapredWork) mrtask.getWork();
+ HashMap<String, Operator<? extends Serializable>> operators = work.getAliasToWork();
+
+ Set entries = operators.entrySet();
+ Iterator<Entry<String, Operator>> iterator = entries.iterator();
+ List<Operator> mapRootOps = new ArrayList<Operator>();
+
+ // get map root operators
+ while (iterator.hasNext()) {
+ Operator next = iterator.next().getValue();
+ if (!mapRootOps.contains(next)) {
+ // clear that only for the case of union
+ mapRootOps.add(next);
+ }
+ }
+
+ // get map local work
+ MapredLocalWork localWork = work.getMapLocalWork();
+ if (localWork != null) {
+ HashMap<String, Operator<? extends Serializable>> localOperators = localWork.getAliasToWork();
+
+ Set localEntries = localOperators.entrySet();
+ Iterator<Entry<String, Operator>> localIterator = localEntries.iterator();
+ while (localIterator.hasNext()) {
+ mapRootOps.add(localIterator.next().getValue());
+ }
+
+ HashMap<String, FetchWork> localFetch = localWork.getAliasToFetchWork();
+ Set localFetchEntries = localFetch.entrySet();
+ Iterator<Entry<String, FetchWork>> localFetchIterator = localFetchEntries.iterator();
+ while (localFetchIterator.hasNext()) {
+ Entry<String, FetchWork> fetchMap = localFetchIterator.next();
+ FetchWork fetch = fetchMap.getValue();
+ String alias = fetchMap.getKey();
+ List<PartitionDesc> dirPart = fetch.getPartDesc();
+
+ // temporary hack: put the first partitionDesc into the map
+ aliasToPath.put(alias, dirPart.get(0));
+ }
+ }
+
+ Boolean visited = tasksVisited.get(task);
+ if (visited != null && visited.booleanValue() == true) {
+ return mapRootOps;
+ }
+
+ // do that only for union operator
+ for (Operator op : mapRootOps)
+ if (op.getParentOperators() != null)
+ op.getParentOperators().clear();
+
+ List<Operator> mapLeaves = new ArrayList<Operator>();
+ downToLeaves(mapRootOps, mapLeaves);
+ List<Operator> reduceOps = new ArrayList<Operator>();
+
+ if (work.getReducer() != null)
+ reduceOps.add(work.getReducer());
+
+ for (Operator mapLeaf : mapLeaves) {
+ mapLeaf.setChildOperators(reduceOps);
+ }
+
+ for (Operator reduceOp : reduceOps) {
+ if (reduceOp != null)
+ reduceOp.setParentOperators(mapLeaves);
+ }
+
+ List<Operator> leafs = new ArrayList<Operator>();
+ if (reduceOps.size() > 0) {
+ downToLeaves(reduceOps, leafs);
+ } else {
+ leafs = mapLeaves;
+ }
+
+ List<Operator> mapChildren = new ArrayList<Operator>();
+ if (task.getChildTasks() != null && task.getChildTasks().size() > 0) {
+ for (Object child : task.getChildTasks()) {
+ List<Operator> childMapOps = articulateMapReduceOperators((Task) child, rootOps, aliasToPath, rootTasks);
+ if (childMapOps == null)
+ continue;
+
+ for (Operator childMap : childMapOps) {
+ if (childMap instanceof TableScanOperator) {
+ TableScanDesc topDesc = (TableScanDesc) childMap.getConf();
+ if (topDesc == null)
+ mapChildren.add(childMap);
+ else {
+ rootOps.add(childMap);
+ }
+ } else {
+ // if not table scan, add the child
+ mapChildren.add(childMap);
+ }
+ }
+ }
+
+ if (mapChildren.size() > 0) {
+ int i = 0;
+ for (Operator leaf : leafs) {
+ if (leaf.getChildOperators() == null || leaf.getChildOperators().size() == 0)
+ leaf.setChildOperators(new ArrayList<Operator>());
+ leaf.getChildOperators().add(mapChildren.get(i));
+ i++;
+ }
+ i = 0;
+ for (Operator child : mapChildren) {
+ if (child.getParentOperators() == null || child.getParentOperators().size() == 0)
+ child.setParentOperators(new ArrayList<Operator>());
+ child.getParentOperators().add(leafs.get(i));
+ i++;
+ }
+ }
+ }
+
+ // mark this task as visited
+ this.tasksVisited.put(task, true);
+ return mapRootOps;
+ }
+
+ /**
+ * down to leaf nodes
+ *
+ * @param ops
+ * @param leaves
+ */
+ private void downToLeaves(List<Operator> ops, List<Operator> leaves) {
+
+ // Operator currentOp;
+ for (Operator op : ops) {
+ if (op != null && op.getChildOperators() != null && op.getChildOperators().size() > 0) {
+ downToLeaves(op.getChildOperators(), leaves);
+ } else {
+ if (op != null && leaves.indexOf(op) < 0)
+ leaves.add(op);
+ }
+ }
+ }
+
+ private void getLeaves(List<Operator> roots, List<Operator> currentLeaves) {
+ for (Operator op : roots) {
+ List<Operator> children = op.getChildOperators();
+ if (children == null || children.size() <= 0) {
+ currentLeaves.add(op);
+ } else {
+ getLeaves(children, currentLeaves);
+ }
+ }
+ }
+
+ private void executeHyracksJob(JobSpecification job) throws Exception {
+
+ /**
+ * load the properties file if it is not loaded
+ */
+ if (clusterProps == null) {
+ clusterProps = new Properties();
+ InputStream confIn = new FileInputStream(clusterPropertiesPath);
+ clusterProps.load(confIn);
+ confIn.close();
+ }
+
+ if (hcc == null) {
+ BufferedReader ipReader = new BufferedReader(new InputStreamReader(new FileInputStream(masterFilePath)));
+ String masterNode = ipReader.readLine();
+ ipReader.close();
+
+ InetAddress[] ips = InetAddress.getAllByName(masterNode);
+ int port = Integer.parseInt(clusterProps.getProperty("CC_CLIENTPORT"));
+ for (InetAddress ip : ips) {
+ if (ip.getAddress().length <= 4) {
+ try {
+ hcc = new HyracksConnection(ip.getHostAddress(), port);
+ break;
+ } catch (Exception e) {
+ continue;
+ }
+ }
+ }
+ }
+
+ long start = System.currentTimeMillis();
+ JobId jobId = hcc.startJob(job);
+ hcc.waitForCompletion(jobId);
+
+ // System.out.println("job finished: " + jobId.toString());
+ // call all leave nodes to end
+ for (Operator leaf : leaveOps) {
+ jobClose(leaf);
+ }
+
+ long end = System.currentTimeMillis();
+ System.err.println(start + " " + end + " " + (end - start));
+ }
+
+ /**
+ * mv to final directory on hdfs (not real final)
+ *
+ * @param leaf
+ * @throws Exception
+ */
+ private void jobClose(Operator leaf) throws Exception {
+ FileSinkOperator fsOp = (FileSinkOperator) leaf;
+ FileSinkDesc desc = fsOp.getConf();
+ boolean isNativeTable = !desc.getTableInfo().isNonNative();
+ if ((conf != null) && isNativeTable) {
+ String specPath = desc.getDirName();
+ DynamicPartitionCtx dpCtx = desc.getDynPartCtx();
+ // for 0.7.0
+ fsOp.mvFileToFinalPath(specPath, conf, true, LOG, dpCtx);
+ // for 0.8.0
+ // Utilities.mvFileToFinalPath(specPath, conf, true, LOG, dpCtx,
+ // desc);
+ }
+ }
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/edu/uci/ics/hivesterix/runtime/exec/IExecutionEngine.java b/hivesterix/hivesterix-dist/src/main/java/edu/uci/ics/hivesterix/runtime/exec/IExecutionEngine.java
new file mode 100644
index 0000000..c64a39b
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/edu/uci/ics/hivesterix/runtime/exec/IExecutionEngine.java
@@ -0,0 +1,25 @@
+package edu.uci.ics.hivesterix.runtime.exec;
+
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.Task;
+
+public interface IExecutionEngine {
+
+ /**
+ * compile the job
+ *
+ * @param rootTasks
+ * : Hive MapReduce plan
+ * @return 0 pass, 1 fail
+ */
+ public int compileJob(List<Task<? extends Serializable>> rootTasks);
+
+ /**
+ * execute the job with latest compiled plan
+ *
+ * @return
+ */
+ public int executeJob();
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/Driver.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/Driver.java
new file mode 100644
index 0000000..a385742
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/Driver.java
@@ -0,0 +1,1310 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql;
+
+import java.io.DataInput;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.JavaUtils;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.MetaStoreUtils;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.Schema;
+import org.apache.hadoop.hive.ql.exec.ConditionalTask;
+import org.apache.hadoop.hive.ql.exec.ExecDriver;
+import org.apache.hadoop.hive.ql.exec.FetchTask;
+import org.apache.hadoop.hive.ql.exec.MapRedTask;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.StatsTask;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.TaskFactory;
+import org.apache.hadoop.hive.ql.exec.TaskResult;
+import org.apache.hadoop.hive.ql.exec.TaskRunner;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.history.HiveHistory.Keys;
+import org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext;
+import org.apache.hadoop.hive.ql.hooks.Hook;
+import org.apache.hadoop.hive.ql.hooks.HookContext;
+import org.apache.hadoop.hive.ql.hooks.PostExecute;
+import org.apache.hadoop.hive.ql.hooks.PreExecute;
+import org.apache.hadoop.hive.ql.hooks.ReadEntity;
+import org.apache.hadoop.hive.ql.hooks.WriteEntity;
+import org.apache.hadoop.hive.ql.lockmgr.HiveLock;
+import org.apache.hadoop.hive.ql.lockmgr.HiveLockManager;
+import org.apache.hadoop.hive.ql.lockmgr.HiveLockManagerCtx;
+import org.apache.hadoop.hive.ql.lockmgr.HiveLockMode;
+import org.apache.hadoop.hive.ql.lockmgr.HiveLockObj;
+import org.apache.hadoop.hive.ql.lockmgr.HiveLockObject;
+import org.apache.hadoop.hive.ql.lockmgr.HiveLockObject.HiveLockObjectData;
+import org.apache.hadoop.hive.ql.lockmgr.LockException;
+import org.apache.hadoop.hive.ql.metadata.AuthorizationException;
+import org.apache.hadoop.hive.ql.metadata.DummyPartition;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.AbstractSemanticAnalyzerHook;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.ErrorMsg;
+import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContext;
+import org.apache.hadoop.hive.ql.parse.HiveSemanticAnalyzerHookContextImpl;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.ParseDriver;
+import org.apache.hadoop.hive.ql.parse.ParseException;
+import org.apache.hadoop.hive.ql.parse.ParseUtils;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.VariableSubstitution;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolver;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles;
+import org.apache.hadoop.hive.ql.plan.HiveOperation;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.ql.processors.CommandProcessor;
+import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
+import org.apache.hadoop.hive.serde2.ByteStream;
+import org.apache.hadoop.hive.shims.ShimLoader;
+import org.apache.hadoop.mapred.ClusterStatus;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import edu.uci.ics.hivesterix.runtime.exec.HyracksExecutionEngine;
+import edu.uci.ics.hivesterix.runtime.exec.IExecutionEngine;
+
+@SuppressWarnings({ "deprecation", "unused" })
+public class Driver implements CommandProcessor {
+
+ static final private Log LOG = LogFactory.getLog(Driver.class.getName());
+ static final private LogHelper console = new LogHelper(LOG);
+
+ // hive-sterix
+ private IExecutionEngine engine;
+ private boolean hivesterix = false;
+
+ private int maxRows = 100;
+ ByteStream.Output bos = new ByteStream.Output();
+
+ private HiveConf conf;
+ private DataInput resStream;
+ private Context ctx;
+ private QueryPlan plan;
+ private Schema schema;
+ private HiveLockManager hiveLockMgr;
+
+ private String errorMessage;
+ private String SQLState;
+
+ // A limit on the number of threads that can be launched
+ private int maxthreads;
+ private final int sleeptime = 2000;
+
+ protected int tryCount = Integer.MAX_VALUE;
+
+ private int checkLockManager() {
+ boolean supportConcurrency = conf.getBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY);
+ if (supportConcurrency && (hiveLockMgr == null)) {
+ try {
+ setLockManager();
+ } catch (SemanticException e) {
+ errorMessage = "FAILED: Error in semantic analysis: " + e.getMessage();
+ SQLState = ErrorMsg.findSQLState(e.getMessage());
+ console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ return (12);
+ }
+ }
+ return (0);
+ }
+
+ private void setLockManager() throws SemanticException {
+ boolean supportConcurrency = conf.getBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY);
+ if (supportConcurrency) {
+ String lockMgr = conf.getVar(HiveConf.ConfVars.HIVE_LOCK_MANAGER);
+ if ((lockMgr == null) || (lockMgr.isEmpty())) {
+ throw new SemanticException(ErrorMsg.LOCKMGR_NOT_SPECIFIED.getMsg());
+ }
+
+ try {
+ hiveLockMgr = (HiveLockManager) ReflectionUtils.newInstance(conf.getClassByName(lockMgr), conf);
+ hiveLockMgr.setContext(new HiveLockManagerCtx(conf));
+ } catch (Exception e) {
+ throw new SemanticException(ErrorMsg.LOCKMGR_NOT_INITIALIZED.getMsg() + e.getMessage());
+ }
+ }
+ }
+
+ public void init() {
+ Operator.resetId();
+ }
+
+ /**
+ * Return the status information about the Map-Reduce cluster
+ */
+ public ClusterStatus getClusterStatus() throws Exception {
+ ClusterStatus cs;
+ try {
+ JobConf job = new JobConf(conf, ExecDriver.class);
+ JobClient jc = new JobClient(job);
+ cs = jc.getClusterStatus();
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw e;
+ }
+ LOG.info("Returning cluster status: " + cs.toString());
+ return cs;
+ }
+
+ public Schema getSchema() {
+ return schema;
+ }
+
+ /**
+ * Get a Schema with fields represented with native Hive types
+ */
+ public static Schema getSchema(BaseSemanticAnalyzer sem, HiveConf conf) {
+ Schema schema = null;
+
+ // If we have a plan, prefer its logical result schema if it's
+ // available; otherwise, try digging out a fetch task; failing that,
+ // give up.
+ if (sem == null) {
+ // can't get any info without a plan
+ } else if (sem.getResultSchema() != null) {
+ List<FieldSchema> lst = sem.getResultSchema();
+ schema = new Schema(lst, null);
+ } else if (sem.getFetchTask() != null) {
+ FetchTask ft = sem.getFetchTask();
+ TableDesc td = ft.getTblDesc();
+ // partitioned tables don't have tableDesc set on the FetchTask.
+ // Instead
+ // they have a list of PartitionDesc objects, each with a table
+ // desc.
+ // Let's
+ // try to fetch the desc for the first partition and use it's
+ // deserializer.
+ if (td == null && ft.getWork() != null && ft.getWork().getPartDesc() != null) {
+ if (ft.getWork().getPartDesc().size() > 0) {
+ td = ft.getWork().getPartDesc().get(0).getTableDesc();
+ }
+ }
+
+ if (td == null) {
+ LOG.info("No returning schema.");
+ } else {
+ String tableName = "result";
+ List<FieldSchema> lst = null;
+ try {
+ lst = MetaStoreUtils.getFieldsFromDeserializer(tableName, td.getDeserializer());
+ } catch (Exception e) {
+ LOG.warn("Error getting schema: " + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ if (lst != null) {
+ schema = new Schema(lst, null);
+ }
+ }
+ }
+ if (schema == null) {
+ schema = new Schema();
+ }
+ LOG.info("Returning Hive schema: " + schema);
+ return schema;
+ }
+
+ /**
+ * Get a Schema with fields represented with Thrift DDL types
+ */
+ public Schema getThriftSchema() throws Exception {
+ Schema schema;
+ try {
+ schema = getSchema();
+ if (schema != null) {
+ List<FieldSchema> lst = schema.getFieldSchemas();
+ // Go over the schema and convert type to thrift type
+ if (lst != null) {
+ for (FieldSchema f : lst) {
+ f.setType(MetaStoreUtils.typeToThriftType(f.getType()));
+ }
+ }
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw e;
+ }
+ LOG.info("Returning Thrift schema: " + schema);
+ return schema;
+ }
+
+ /**
+ * Return the maximum number of rows returned by getResults
+ */
+ public int getMaxRows() {
+ return maxRows;
+ }
+
+ /**
+ * Set the maximum number of rows returned by getResults
+ */
+ public void setMaxRows(int maxRows) {
+ this.maxRows = maxRows;
+ }
+
+ public boolean hasReduceTasks(List<Task<? extends Serializable>> tasks) {
+ if (tasks == null) {
+ return false;
+ }
+
+ boolean hasReduce = false;
+ for (Task<? extends Serializable> task : tasks) {
+ if (task.hasReduce()) {
+ return true;
+ }
+
+ hasReduce = (hasReduce || hasReduceTasks(task.getChildTasks()));
+ }
+ return hasReduce;
+ }
+
+ /**
+ * for backwards compatibility with current tests
+ */
+ public Driver(HiveConf conf) {
+ this.conf = conf;
+
+ // hivesterix
+ engine = new HyracksExecutionEngine(conf);
+ }
+
+ public Driver() {
+ if (SessionState.get() != null) {
+ conf = SessionState.get().getConf();
+ }
+
+ // hivesterix
+ engine = new HyracksExecutionEngine(conf);
+ }
+
+ // hivesterix: plan printer
+ public Driver(HiveConf conf, PrintWriter planPrinter) {
+ this.conf = conf;
+ engine = new HyracksExecutionEngine(conf, planPrinter);
+ }
+
+ public void clear() {
+ this.hivesterix = false;
+ }
+
+ /**
+ * Compile a new query. Any currently-planned query associated with this
+ * Driver is discarded.
+ *
+ * @param command
+ * The SQL query to compile.
+ */
+ public int compile(String command) {
+ if (plan != null) {
+ close();
+ plan = null;
+ }
+
+ TaskFactory.resetId();
+
+ try {
+ command = new VariableSubstitution().substitute(conf, command);
+ ctx = new Context(conf);
+
+ ParseDriver pd = new ParseDriver();
+ ASTNode tree = pd.parse(command, ctx);
+ tree = ParseUtils.findRootNonNullToken(tree);
+
+ BaseSemanticAnalyzer sem = SemanticAnalyzerFactory.get(conf, tree);
+ List<AbstractSemanticAnalyzerHook> saHooks = getSemanticAnalyzerHooks();
+
+ // Do semantic analysis and plan generation
+ if (saHooks != null) {
+ HiveSemanticAnalyzerHookContext hookCtx = new HiveSemanticAnalyzerHookContextImpl();
+ hookCtx.setConf(conf);
+ for (AbstractSemanticAnalyzerHook hook : saHooks) {
+ tree = hook.preAnalyze(hookCtx, tree);
+ }
+ sem.analyze(tree, ctx);
+ for (AbstractSemanticAnalyzerHook hook : saHooks) {
+ hook.postAnalyze(hookCtx, sem.getRootTasks());
+ }
+ } else {
+ sem.analyze(tree, ctx);
+ }
+
+ LOG.info("Semantic Analysis Completed");
+
+ // validate the plan
+ sem.validate();
+
+ plan = new QueryPlan(command, sem);
+ // initialize FetchTask right here
+ if (plan.getFetchTask() != null) {
+ plan.getFetchTask().initialize(conf, plan, null);
+ }
+
+ // get the output schema
+ schema = getSchema(sem, conf);
+
+ // test Only - serialize the query plan and deserialize it
+ if (sem instanceof SemanticAnalyzer && command.toLowerCase().indexOf("create") < 0) {
+
+ Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
+
+ String queryPlanFileName = ctx.getLocalScratchDir(true) + Path.SEPARATOR_CHAR + "queryplan.xml";
+ LOG.info("query plan = " + queryPlanFileName);
+ queryPlanFileName = new Path(queryPlanFileName).toUri().getPath();
+
+ // serialize the queryPlan
+ FileOutputStream fos = new FileOutputStream(queryPlanFileName);
+ Utilities.serializeQueryPlan(plan, fos);
+ fos.close();
+
+ // deserialize the queryPlan
+ FileInputStream fis = new FileInputStream(queryPlanFileName);
+ QueryPlan newPlan = Utilities.deserializeQueryPlan(fis, conf);
+ fis.close();
+
+ // Use the deserialized plan
+ plan = newPlan;
+ }
+
+ // initialize FetchTask right here
+ if (plan.getFetchTask() != null) {
+ plan.getFetchTask().initialize(conf, plan, null);
+ }
+
+ // do the authorization check
+ if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTHORIZATION_ENABLED)) {
+ try {
+ // doAuthorization(sem);
+ } catch (AuthorizationException authExp) {
+ console.printError("Authorization failed:" + authExp.getMessage()
+ + ". Use show grant to get more details.");
+ return 403;
+ }
+ }
+
+ // hyracks run
+ if (sem instanceof SemanticAnalyzer && command.toLowerCase().indexOf("create") < 0) {
+ hivesterix = true;
+ return engine.compileJob(sem.getRootTasks());
+ }
+
+ return 0;
+ } catch (SemanticException e) {
+ errorMessage = "FAILED: Error in semantic analysis: " + e.getMessage();
+ SQLState = ErrorMsg.findSQLState(e.getMessage());
+ console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ return (10);
+ } catch (ParseException e) {
+ errorMessage = "FAILED: Parse Error: " + e.getMessage();
+ SQLState = ErrorMsg.findSQLState(e.getMessage());
+ console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ return (11);
+ } catch (Exception e) {
+ errorMessage = "FAILED: Hive Internal Error: " + Utilities.getNameMessage(e);
+ SQLState = ErrorMsg.findSQLState(e.getMessage());
+ console.printError(errorMessage + "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ return (12);
+ }
+ }
+
+ private void doAuthorization(BaseSemanticAnalyzer sem) throws HiveException, AuthorizationException {
+ HashSet<ReadEntity> inputs = sem.getInputs();
+ HashSet<WriteEntity> outputs = sem.getOutputs();
+ SessionState ss = SessionState.get();
+ HiveOperation op = ss.getHiveOperation();
+ Hive db = sem.getDb();
+ if (op != null) {
+ if (op.equals(HiveOperation.CREATETABLE_AS_SELECT) || op.equals(HiveOperation.CREATETABLE)) {
+ ss.getAuthorizer().authorize(db.getDatabase(db.getCurrentDatabase()), null,
+ HiveOperation.CREATETABLE_AS_SELECT.getOutputRequiredPrivileges());
+ } else {
+ // if (op.equals(HiveOperation.IMPORT)) {
+ // ImportSemanticAnalyzer isa = (ImportSemanticAnalyzer) sem;
+ // if (!isa.existsTable()) {
+ ss.getAuthorizer().authorize(db.getDatabase(db.getCurrentDatabase()), null,
+ HiveOperation.CREATETABLE_AS_SELECT.getOutputRequiredPrivileges());
+ // }
+ // }
+ }
+ if (outputs != null && outputs.size() > 0) {
+ for (WriteEntity write : outputs) {
+
+ if (write.getType() == WriteEntity.Type.PARTITION) {
+ Partition part = db.getPartition(write.getTable(), write.getPartition().getSpec(), false);
+ if (part != null) {
+ ss.getAuthorizer().authorize(write.getPartition(), null, op.getOutputRequiredPrivileges());
+ continue;
+ }
+ }
+
+ if (write.getTable() != null) {
+ ss.getAuthorizer().authorize(write.getTable(), null, op.getOutputRequiredPrivileges());
+ }
+ }
+
+ }
+ }
+
+ if (inputs != null && inputs.size() > 0) {
+
+ Map<Table, List<String>> tab2Cols = new HashMap<Table, List<String>>();
+ Map<Partition, List<String>> part2Cols = new HashMap<Partition, List<String>>();
+
+ Map<String, Boolean> tableUsePartLevelAuth = new HashMap<String, Boolean>();
+ for (ReadEntity read : inputs) {
+ if (read.getPartition() != null) {
+ Table tbl = read.getTable();
+ String tblName = tbl.getTableName();
+ if (tableUsePartLevelAuth.get(tblName) == null) {
+ boolean usePartLevelPriv = (tbl.getParameters().get("PARTITION_LEVEL_PRIVILEGE") != null && ("TRUE"
+ .equalsIgnoreCase(tbl.getParameters().get("PARTITION_LEVEL_PRIVILEGE"))));
+ if (usePartLevelPriv) {
+ tableUsePartLevelAuth.put(tblName, Boolean.TRUE);
+ } else {
+ tableUsePartLevelAuth.put(tblName, Boolean.FALSE);
+ }
+ }
+ }
+ }
+
+ if (op.equals(HiveOperation.CREATETABLE_AS_SELECT) || op.equals(HiveOperation.QUERY)) {
+ SemanticAnalyzer querySem = (SemanticAnalyzer) sem;
+ ParseContext parseCtx = querySem.getParseContext();
+ Map<TableScanOperator, Table> tsoTopMap = parseCtx.getTopToTable();
+
+ for (Map.Entry<String, Operator<? extends Serializable>> topOpMap : querySem.getParseContext()
+ .getTopOps().entrySet()) {
+ Operator<? extends Serializable> topOp = topOpMap.getValue();
+ if (topOp instanceof TableScanOperator && tsoTopMap.containsKey(topOp)) {
+ TableScanOperator tableScanOp = (TableScanOperator) topOp;
+ Table tbl = tsoTopMap.get(tableScanOp);
+ List<Integer> neededColumnIds = tableScanOp.getNeededColumnIDs();
+ List<FieldSchema> columns = tbl.getCols();
+ List<String> cols = new ArrayList<String>();
+ if (neededColumnIds != null && neededColumnIds.size() > 0) {
+ for (int i = 0; i < neededColumnIds.size(); i++) {
+ cols.add(columns.get(neededColumnIds.get(i)).getName());
+ }
+ } else {
+ for (int i = 0; i < columns.size(); i++) {
+ cols.add(columns.get(i).getName());
+ }
+ }
+ if (tbl.isPartitioned() && tableUsePartLevelAuth.get(tbl.getTableName())) {
+ String alias_id = topOpMap.getKey();
+ PrunedPartitionList partsList = PartitionPruner.prune(parseCtx.getTopToTable().get(topOp),
+ parseCtx.getOpToPartPruner().get(topOp), parseCtx.getConf(), alias_id,
+ parseCtx.getPrunedPartitions());
+ Set<Partition> parts = new HashSet<Partition>();
+ parts.addAll(partsList.getConfirmedPartns());
+ parts.addAll(partsList.getUnknownPartns());
+ for (Partition part : parts) {
+ List<String> existingCols = part2Cols.get(part);
+ if (existingCols == null) {
+ existingCols = new ArrayList<String>();
+ }
+ existingCols.addAll(cols);
+ part2Cols.put(part, existingCols);
+ }
+ } else {
+ List<String> existingCols = tab2Cols.get(tbl);
+ if (existingCols == null) {
+ existingCols = new ArrayList<String>();
+ }
+ existingCols.addAll(cols);
+ tab2Cols.put(tbl, existingCols);
+ }
+ }
+ }
+ }
+
+ // cache the results for table authorization
+ Set<String> tableAuthChecked = new HashSet<String>();
+ for (ReadEntity read : inputs) {
+ Table tbl = null;
+ if (read.getPartition() != null) {
+ tbl = read.getPartition().getTable();
+ // use partition level authorization
+ if (tableUsePartLevelAuth.get(tbl.getTableName())) {
+ List<String> cols = part2Cols.get(read.getPartition());
+ if (cols != null && cols.size() > 0) {
+ ss.getAuthorizer().authorize(read.getPartition().getTable(), read.getPartition(), cols,
+ op.getInputRequiredPrivileges(), null);
+ } else {
+ ss.getAuthorizer().authorize(read.getPartition(), op.getInputRequiredPrivileges(), null);
+ }
+ continue;
+ }
+ } else if (read.getTable() != null) {
+ tbl = read.getTable();
+ }
+
+ // if we reach here, it means it needs to do a table
+ // authorization
+ // check, and the table authorization may already happened
+ // because of other
+ // partitions
+ if (tbl != null && !tableAuthChecked.contains(tbl.getTableName())) {
+ List<String> cols = tab2Cols.get(tbl);
+ if (cols != null && cols.size() > 0) {
+ ss.getAuthorizer().authorize(tbl, null, cols, op.getInputRequiredPrivileges(), null);
+ } else {
+ ss.getAuthorizer().authorize(tbl, op.getInputRequiredPrivileges(), null);
+ }
+ tableAuthChecked.add(tbl.getTableName());
+ }
+ }
+
+ }
+ }
+
+ /**
+ * @return The current query plan associated with this Driver, if any.
+ */
+ public QueryPlan getPlan() {
+ return plan;
+ }
+
+ /**
+ * @param t
+ * The table to be locked
+ * @param p
+ * The partition to be locked
+ * @param mode
+ * The mode of the lock (SHARED/EXCLUSIVE) Get the list of
+ * objects to be locked. If a partition needs to be locked (in
+ * any mode), all its parents should also be locked in SHARED
+ * mode.
+ **/
+ private List<HiveLockObj> getLockObjects(Table t, Partition p, HiveLockMode mode) throws SemanticException {
+ List<HiveLockObj> locks = new LinkedList<HiveLockObj>();
+
+ HiveLockObjectData lockData = new HiveLockObjectData(plan.getQueryId(), String.valueOf(System
+ .currentTimeMillis()), "IMPLICIT");
+
+ if (t != null) {
+ locks.add(new HiveLockObj(new HiveLockObject(t, lockData), mode));
+ mode = HiveLockMode.SHARED;
+ locks.add(new HiveLockObj(new HiveLockObject(t.getDbName(), lockData), mode));
+ return locks;
+ }
+
+ if (p != null) {
+ if (!(p instanceof DummyPartition)) {
+ locks.add(new HiveLockObj(new HiveLockObject(p, lockData), mode));
+ }
+
+ // All the parents are locked in shared mode
+ mode = HiveLockMode.SHARED;
+
+ // For dummy partitions, only partition name is needed
+ String name = p.getName();
+
+ if (p instanceof DummyPartition) {
+ name = p.getName().split("@")[2];
+ }
+
+ String partName = name;
+ String partialName = "";
+ String[] partns = name.split("/");
+ int len = p instanceof DummyPartition ? partns.length : partns.length - 1;
+ for (int idx = 0; idx < len; idx++) {
+ String partn = partns[idx];
+ partialName += partn;
+ try {
+ locks.add(new HiveLockObj(new HiveLockObject(new DummyPartition(p.getTable(), p.getTable()
+ .getDbName() + "/" + p.getTable().getTableName() + "/" + partialName), lockData), mode));
+ partialName += "/";
+ } catch (HiveException e) {
+ throw new SemanticException(e.getMessage());
+ }
+ }
+
+ locks.add(new HiveLockObj(new HiveLockObject(p.getTable(), lockData), mode));
+ locks.add(new HiveLockObj(new HiveLockObject(p.getTable().getDbName(), lockData), mode));
+ }
+ return locks;
+ }
+
+ /**
+ * Acquire read and write locks needed by the statement. The list of objects
+ * to be locked are obtained from he inputs and outputs populated by the
+ * compiler. The lock acuisition scheme is pretty simple. If all the locks
+ * cannot be obtained, error out. Deadlock is avoided by making sure that
+ * the locks are lexicographically sorted.
+ **/
+ public int acquireReadWriteLocks() {
+ try {
+ int sleepTime = conf.getIntVar(HiveConf.ConfVars.HIVE_LOCK_SLEEP_BETWEEN_RETRIES) * 1000;
+ int numRetries = conf.getIntVar(HiveConf.ConfVars.HIVE_LOCK_NUMRETRIES);
+
+ boolean supportConcurrency = conf.getBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY);
+ if (!supportConcurrency) {
+ return 0;
+ }
+
+ List<HiveLockObj> lockObjects = new ArrayList<HiveLockObj>();
+
+ // Sort all the inputs, outputs.
+ // If a lock needs to be acquired on any partition, a read lock
+ // needs to be acquired on all
+ // its parents also
+ for (ReadEntity input : plan.getInputs()) {
+ if (input.getType() == ReadEntity.Type.TABLE) {
+ lockObjects.addAll(getLockObjects(input.getTable(), null, HiveLockMode.SHARED));
+ } else {
+ lockObjects.addAll(getLockObjects(null, input.getPartition(), HiveLockMode.SHARED));
+ }
+ }
+
+ for (WriteEntity output : plan.getOutputs()) {
+ if (output.getTyp() == WriteEntity.Type.TABLE) {
+ lockObjects.addAll(getLockObjects(output.getTable(), null,
+ output.isComplete() ? HiveLockMode.EXCLUSIVE : HiveLockMode.SHARED));
+ } else if (output.getTyp() == WriteEntity.Type.PARTITION) {
+ lockObjects.addAll(getLockObjects(null, output.getPartition(), HiveLockMode.EXCLUSIVE));
+ }
+ // In case of dynamic queries, it is possible to have incomplete
+ // dummy partitions
+ else if (output.getTyp() == WriteEntity.Type.DUMMYPARTITION) {
+ lockObjects.addAll(getLockObjects(null, output.getPartition(), HiveLockMode.SHARED));
+ }
+ }
+
+ if (lockObjects.isEmpty() && !ctx.isNeedLockMgr()) {
+ return 0;
+ }
+
+ int ret = checkLockManager();
+ if (ret != 0) {
+ return ret;
+ }
+
+ HiveLockObjectData lockData = new HiveLockObjectData(plan.getQueryId(), String.valueOf(System
+ .currentTimeMillis()), "IMPLICIT");
+
+ // Lock the database also
+ try {
+ Hive db = Hive.get(conf);
+ lockObjects.add(new HiveLockObj(new HiveLockObject(db.getCurrentDatabase(), lockData),
+ HiveLockMode.SHARED));
+ } catch (HiveException e) {
+ throw new SemanticException(e.getMessage());
+ }
+
+ ctx.setHiveLockMgr(hiveLockMgr);
+ List<HiveLock> hiveLocks = null;
+
+ int tryNum = 1;
+ do {
+
+ // ctx.getHiveLockMgr();
+ // hiveLocks = ctx.getHiveLockMgr().lock(lockObjects, false);
+
+ if (hiveLocks != null) {
+ break;
+ }
+
+ tryNum++;
+ try {
+ Thread.sleep(sleepTime);
+ } catch (InterruptedException e) {
+ }
+ } while (tryNum < numRetries);
+
+ if (hiveLocks == null) {
+ throw new SemanticException(ErrorMsg.LOCK_CANNOT_BE_ACQUIRED.getMsg());
+ } else {
+ ctx.setHiveLocks(hiveLocks);
+ }
+
+ return (0);
+ } catch (SemanticException e) {
+ errorMessage = "FAILED: Error in acquiring locks: " + e.getMessage();
+ SQLState = ErrorMsg.findSQLState(e.getMessage());
+ console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ return (10);
+ } catch (Exception e) {
+ errorMessage = "FAILED: Error in acquiring locks: " + e.getMessage();
+ SQLState = ErrorMsg.findSQLState(e.getMessage());
+ console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ return (10);
+ }
+ }
+
+ /**
+ * Release all the locks acquired implicitly by the statement. Note that the
+ * locks acquired with 'keepAlive' set to True are not released.
+ **/
+ private void releaseLocks() {
+ if (ctx != null && ctx.getHiveLockMgr() != null) {
+ try {
+ ctx.getHiveLockMgr().close();
+ ctx.setHiveLocks(null);
+ } catch (LockException e) {
+ }
+ }
+ }
+
+ /**
+ * @param hiveLocks
+ * list of hive locks to be released Release all the locks
+ * specified. If some of the locks have already been released,
+ * ignore them
+ **/
+ private void releaseLocks(List<HiveLock> hiveLocks) {
+ if (hiveLocks != null) {
+ ctx.getHiveLockMgr().releaseLocks(hiveLocks);
+ }
+ ctx.setHiveLocks(null);
+ }
+
+ public CommandProcessorResponse run(String command) {
+ errorMessage = null;
+ SQLState = null;
+
+ int ret = compile(command);
+ if (ret != 0) {
+ // releaseLocks(ctx.getHiveLocks());
+ return new CommandProcessorResponse(ret, errorMessage, SQLState);
+ }
+
+ // ret = acquireReadWriteLocks();
+ if (ret != 0) {
+ // releaseLocks(ctx.getHiveLocks());
+ return new CommandProcessorResponse(ret, errorMessage, SQLState);
+ }
+
+ ret = execute();
+ if (ret != 0) {
+ // releaseLocks(ctx.getHiveLocks());
+ return new CommandProcessorResponse(ret, errorMessage, SQLState);
+ }
+
+ // releaseLocks(ctx.getHiveLocks());
+ return new CommandProcessorResponse(ret);
+ }
+
+ private List<AbstractSemanticAnalyzerHook> getSemanticAnalyzerHooks() throws Exception {
+ ArrayList<AbstractSemanticAnalyzerHook> saHooks = new ArrayList<AbstractSemanticAnalyzerHook>();
+ String pestr = conf.getVar(HiveConf.ConfVars.SEMANTIC_ANALYZER_HOOK);
+ if (pestr == null) {
+ return saHooks;
+ }
+ pestr = pestr.trim();
+ if (pestr.equals("")) {
+ return saHooks;
+ }
+
+ String[] peClasses = pestr.split(",");
+
+ for (String peClass : peClasses) {
+ try {
+ AbstractSemanticAnalyzerHook hook = HiveUtils.getSemanticAnalyzerHook(conf, peClass);
+ saHooks.add(hook);
+ } catch (HiveException e) {
+ console.printError("Pre Exec Hook Class not found:" + e.getMessage());
+ throw e;
+ }
+ }
+
+ return saHooks;
+ }
+
+ private List<Hook> getPreExecHooks() throws Exception {
+ ArrayList<Hook> pehooks = new ArrayList<Hook>();
+ String pestr = conf.getVar(HiveConf.ConfVars.PREEXECHOOKS);
+ pestr = pestr.trim();
+ if (pestr.equals("")) {
+ return pehooks;
+ }
+
+ String[] peClasses = pestr.split(",");
+
+ for (String peClass : peClasses) {
+ try {
+ pehooks.add((Hook) Class.forName(peClass.trim(), true, JavaUtils.getClassLoader()).newInstance());
+ } catch (ClassNotFoundException e) {
+ console.printError("Pre Exec Hook Class not found:" + e.getMessage());
+ throw e;
+ }
+ }
+
+ return pehooks;
+ }
+
+ private List<Hook> getPostExecHooks() throws Exception {
+ ArrayList<Hook> pehooks = new ArrayList<Hook>();
+ String pestr = conf.getVar(HiveConf.ConfVars.POSTEXECHOOKS);
+ pestr = pestr.trim();
+ if (pestr.equals("")) {
+ return pehooks;
+ }
+
+ String[] peClasses = pestr.split(",");
+
+ for (String peClass : peClasses) {
+ try {
+ pehooks.add((Hook) Class.forName(peClass.trim(), true, JavaUtils.getClassLoader()).newInstance());
+ } catch (ClassNotFoundException e) {
+ console.printError("Post Exec Hook Class not found:" + e.getMessage());
+ throw e;
+ }
+ }
+
+ return pehooks;
+ }
+
+ public int execute() {
+ // execute hivesterix plan
+ if (hivesterix) {
+ hivesterix = false;
+ int ret = engine.executeJob();
+ if (ret != 0)
+ return ret;
+ }
+
+ boolean noName = StringUtils.isEmpty(conf.getVar(HiveConf.ConfVars.HADOOPJOBNAME));
+ int maxlen = conf.getIntVar(HiveConf.ConfVars.HIVEJOBNAMELENGTH);
+
+ String queryId = plan.getQueryId();
+ String queryStr = plan.getQueryStr();
+
+ conf.setVar(HiveConf.ConfVars.HIVEQUERYID, queryId);
+ conf.setVar(HiveConf.ConfVars.HIVEQUERYSTRING, queryStr);
+ maxthreads = HiveConf.getIntVar(conf, HiveConf.ConfVars.EXECPARALLETHREADNUMBER);
+
+ try {
+ LOG.info("Starting command: " + queryStr);
+
+ plan.setStarted();
+
+ if (SessionState.get() != null) {
+ SessionState.get().getHiveHistory().startQuery(queryStr, conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
+ SessionState.get().getHiveHistory().logPlanProgress(plan);
+ }
+ resStream = null;
+
+ HookContext hookContext = new HookContext(plan, conf);
+
+ for (Hook peh : getPreExecHooks()) {
+ if (peh instanceof ExecuteWithHookContext) {
+ ((ExecuteWithHookContext) peh).run(hookContext);
+ } else if (peh instanceof PreExecute) {
+ ((PreExecute) peh).run(SessionState.get(), plan.getInputs(), plan.getOutputs(), ShimLoader
+ .getHadoopShims().getUGIForConf(conf));
+ }
+ }
+
+ int jobs = Utilities.getMRTasks(plan.getRootTasks()).size();
+ if (jobs > 0) {
+ console.printInfo("Total MapReduce jobs = " + jobs);
+ }
+ if (SessionState.get() != null) {
+ SessionState.get().getHiveHistory()
+ .setQueryProperty(queryId, Keys.QUERY_NUM_TASKS, String.valueOf(jobs));
+ SessionState.get().getHiveHistory().setIdToTableMap(plan.getIdToTableNameMap());
+ }
+ String jobname = Utilities.abbreviate(queryStr, maxlen - 6);
+
+ // A runtime that launches runnable tasks as separate Threads
+ // through
+ // TaskRunners
+ // As soon as a task isRunnable, it is put in a queue
+ // At any time, at most maxthreads tasks can be running
+ // The main thread polls the TaskRunners to check if they have
+ // finished.
+
+ Queue<Task<? extends Serializable>> runnable = new LinkedList<Task<? extends Serializable>>();
+ Map<TaskResult, TaskRunner> running = new HashMap<TaskResult, TaskRunner>();
+
+ DriverContext driverCxt = new DriverContext(runnable, ctx);
+
+ // Add root Tasks to runnable
+
+ for (Task<? extends Serializable> tsk : plan.getRootTasks()) {
+ driverCxt.addToRunnable(tsk);
+ }
+
+ // Loop while you either have tasks running, or tasks queued up
+
+ while (running.size() != 0 || runnable.peek() != null) {
+ // Launch upto maxthreads tasks
+ while (runnable.peek() != null && running.size() < maxthreads) {
+ Task<? extends Serializable> tsk = runnable.remove();
+ console.printInfo("executing task " + tsk.getName());
+ launchTask(tsk, queryId, noName, running, jobname, jobs, driverCxt);
+ }
+
+ // poll the Tasks to see which one completed
+ TaskResult tskRes = pollTasks(running.keySet());
+ TaskRunner tskRun = running.remove(tskRes);
+ Task<? extends Serializable> tsk = tskRun.getTask();
+ hookContext.addCompleteTask(tskRun);
+
+ int exitVal = tskRes.getExitVal();
+ if (exitVal != 0) {
+ Task<? extends Serializable> backupTask = tsk.getAndInitBackupTask();
+ if (backupTask != null) {
+ errorMessage = "FAILED: Execution Error, return code " + exitVal + " from "
+ + tsk.getClass().getName();
+ console.printError(errorMessage);
+
+ errorMessage = "ATTEMPT: Execute BackupTask: " + backupTask.getClass().getName();
+ console.printError(errorMessage);
+
+ // add backup task to runnable
+ if (DriverContext.isLaunchable(backupTask)) {
+ driverCxt.addToRunnable(backupTask);
+ }
+ continue;
+
+ } else {
+ // TODO: This error messaging is not very informative.
+ // Fix that.
+ errorMessage = "FAILED: Execution Error, return code " + exitVal + " from "
+ + tsk.getClass().getName();
+ SQLState = "08S01";
+ console.printError(errorMessage);
+ if (running.size() != 0) {
+ taskCleanup();
+ }
+ // in case we decided to run everything in local mode,
+ // restore the
+ // the jobtracker setting to its initial value
+ ctx.restoreOriginalTracker();
+ return 9;
+ }
+ }
+
+ if (SessionState.get() != null) {
+ SessionState.get().getHiveHistory()
+ .setTaskProperty(queryId, tsk.getId(), Keys.TASK_RET_CODE, String.valueOf(exitVal));
+ SessionState.get().getHiveHistory().endTask(queryId, tsk);
+ }
+
+ if (tsk.getChildTasks() != null) {
+ for (Task<? extends Serializable> child : tsk.getChildTasks()) {
+ // hivesterix: don't check launchable condition
+ // if (DriverContext.isLaunchable(child)) {
+ driverCxt.addToRunnable(child);
+ // }
+ }
+ }
+ }
+
+ // in case we decided to run everything in local mode, restore the
+ // the jobtracker setting to its initial value
+ ctx.restoreOriginalTracker();
+
+ // remove incomplete outputs.
+ // Some incomplete outputs may be added at the beginning, for eg:
+ // for dynamic partitions.
+ // remove them
+ HashSet<WriteEntity> remOutputs = new HashSet<WriteEntity>();
+ for (WriteEntity output : plan.getOutputs()) {
+ if (!output.isComplete()) {
+ remOutputs.add(output);
+ }
+ }
+
+ for (WriteEntity output : remOutputs) {
+ plan.getOutputs().remove(output);
+ }
+
+ // Get all the post execution hooks and execute them.
+ for (Hook peh : getPostExecHooks()) {
+ if (peh instanceof ExecuteWithHookContext) {
+ ((ExecuteWithHookContext) peh).run(hookContext);
+ } else if (peh instanceof PostExecute) {
+ ((PostExecute) peh)
+ .run(SessionState.get(), plan.getInputs(), plan.getOutputs(),
+ (SessionState.get() != null ? SessionState.get().getLineageState().getLineageInfo()
+ : null), ShimLoader.getHadoopShims().getUGIForConf(conf));
+ }
+ }
+
+ if (SessionState.get() != null) {
+ SessionState.get().getHiveHistory().setQueryProperty(queryId, Keys.QUERY_RET_CODE, String.valueOf(0));
+ SessionState.get().getHiveHistory().printRowCount(queryId);
+ }
+ } catch (Exception e) {
+ if (SessionState.get() != null) {
+ SessionState.get().getHiveHistory().setQueryProperty(queryId, Keys.QUERY_RET_CODE, String.valueOf(12));
+ }
+ // TODO: do better with handling types of Exception here
+ errorMessage = "FAILED: Hive Internal Error: " + Utilities.getNameMessage(e);
+ SQLState = "08S01";
+ console.printError(errorMessage + "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ return (12);
+ } finally {
+ if (SessionState.get() != null) {
+ SessionState.get().getHiveHistory().endQuery(queryId);
+ }
+ if (noName) {
+ conf.setVar(HiveConf.ConfVars.HADOOPJOBNAME, "");
+ }
+ }
+ plan.setDone();
+
+ if (SessionState.get() != null) {
+ try {
+ SessionState.get().getHiveHistory().logPlanProgress(plan);
+ } catch (Exception e) {
+ }
+ }
+ console.printInfo("OK");
+
+ return (0);
+ }
+
+ /**
+ * Launches a new task
+ *
+ * @param tsk
+ * task being launched
+ * @param queryId
+ * Id of the query containing the task
+ * @param noName
+ * whether the task has a name set
+ * @param running
+ * map from taskresults to taskrunners
+ * @param jobname
+ * name of the task, if it is a map-reduce job
+ * @param jobs
+ * number of map-reduce jobs
+ * @param curJobNo
+ * the sequential number of the next map-reduce job
+ * @return the updated number of last the map-reduce job launched
+ */
+
+ public void launchTask(Task<? extends Serializable> tsk, String queryId, boolean noName,
+ Map<TaskResult, TaskRunner> running, String jobname, int jobs, DriverContext cxt) {
+
+ if (SessionState.get() != null) {
+ SessionState.get().getHiveHistory().startTask(queryId, tsk, tsk.getClass().getName());
+ }
+ if (tsk.isMapRedTask() && !(tsk instanceof ConditionalTask)) {
+ if (noName) {
+ conf.setVar(HiveConf.ConfVars.HADOOPJOBNAME, jobname + "(" + tsk.getId() + ")");
+ }
+ cxt.incCurJobNo(1);
+ console.printInfo("Launching Job " + cxt.getCurJobNo() + " out of " + jobs);
+ }
+ tsk.initialize(conf, plan, cxt);
+ TaskResult tskRes = new TaskResult();
+ TaskRunner tskRun = new TaskRunner(tsk, tskRes);
+
+ // HiveConf.getBoolVar(conf, HiveConf.ConfVars.EXECPARALLEL) &&
+ // Launch Task: hivesterix tweak
+ if (tsk instanceof MapRedTask || tsk instanceof StatsTask) {
+ // Launch it in the parallel mode, as a separate thread only for MR
+ // tasks
+ tskRes.setRunning(false);
+ tskRes.setExitVal(0);
+ } else if (tsk instanceof ConditionalTask) {
+ ConditionalTask condTask = (ConditionalTask) tsk;
+ ConditionalResolver crs = condTask.getResolver();
+ if (crs instanceof ConditionalResolverMergeFiles) {
+ tskRes.setRunning(false);
+ tskRes.setExitVal(0);
+
+ List<Task<? extends Serializable>> children = condTask.getListTasks();
+ for (Task<? extends Serializable> child : children)
+ if (child instanceof MapRedTask)
+ cxt.addToRunnable(child);
+ }
+ } else {
+ tskRun.runSequential();
+ }
+ running.put(tskRes, tskRun);
+ return;
+ }
+
+ /**
+ * Cleans up remaining tasks in case of failure
+ */
+
+ public void taskCleanup() {
+ // The currently existing Shutdown hooks will be automatically called,
+ // killing the map-reduce processes.
+ // The non MR processes will be killed as well.
+ System.exit(9);
+ }
+
+ /**
+ * Polls running tasks to see if a task has ended.
+ *
+ * @param results
+ * Set of result objects for running tasks
+ * @return The result object for any completed/failed task
+ */
+
+ public TaskResult pollTasks(Set<TaskResult> results) {
+ Iterator<TaskResult> resultIterator = results.iterator();
+ while (true) {
+ while (resultIterator.hasNext()) {
+ TaskResult tskRes = resultIterator.next();
+ if (tskRes.isRunning() == false) {
+ return tskRes;
+ }
+ }
+
+ // In this loop, nothing was found
+ // Sleep 10 seconds and restart
+ try {
+ Thread.sleep(sleeptime);
+ } catch (InterruptedException ie) {
+ // Do Nothing
+ ;
+ }
+ resultIterator = results.iterator();
+ }
+ }
+
+ public boolean getResults(ArrayList<String> res) throws IOException {
+ if (plan != null && plan.getFetchTask() != null) {
+ FetchTask ft = plan.getFetchTask();
+ ft.setMaxRows(maxRows);
+ return ft.fetch(res);
+ }
+
+ if (resStream == null) {
+ resStream = ctx.getStream();
+ }
+ if (resStream == null) {
+ return false;
+ }
+
+ int numRows = 0;
+ String row = null;
+
+ while (numRows < maxRows) {
+ if (resStream == null) {
+ if (numRows > 0) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ bos.reset();
+ Utilities.StreamStatus ss;
+ try {
+ ss = Utilities.readColumn(resStream, bos);
+ if (bos.getCount() > 0) {
+ row = new String(bos.getData(), 0, bos.getCount(), "UTF-8");
+ } else if (ss == Utilities.StreamStatus.TERMINATED) {
+ row = new String();
+ }
+
+ if (row != null) {
+ numRows++;
+ res.add(row);
+ }
+ } catch (IOException e) {
+ console.printError("FAILED: Unexpected IO exception : " + e.getMessage());
+ res = null;
+ return false;
+ }
+
+ if (ss == Utilities.StreamStatus.EOF) {
+ resStream = ctx.getStream();
+ }
+ }
+ return true;
+ }
+
+ public int close() {
+ try {
+ if (plan != null) {
+ FetchTask fetchTask = plan.getFetchTask();
+ if (null != fetchTask) {
+ try {
+ fetchTask.clearFetch();
+ } catch (Exception e) {
+ LOG.debug(" Exception while clearing the Fetch task ", e);
+ }
+ }
+ }
+ if (ctx != null) {
+ ctx.clear();
+ }
+ if (null != resStream) {
+ try {
+ ((FSDataInputStream) resStream).close();
+ } catch (Exception e) {
+ LOG.debug(" Exception while closing the resStream ", e);
+ }
+ }
+ } catch (Exception e) {
+ console.printError("FAILED: Hive Internal Error: " + Utilities.getNameMessage(e) + "\n"
+ + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ return 13;
+ }
+
+ return 0;
+ }
+
+ public void destroy() {
+ releaseLocks();
+ }
+
+ public org.apache.hadoop.hive.ql.plan.api.Query getQueryPlan() throws IOException {
+ return plan.getQueryPlan();
+ }
+
+ public int getTryCount() {
+ return tryCount;
+ }
+
+ public void setTryCount(int tryCount) {
+ this.tryCount = tryCount;
+ }
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java
new file mode 100644
index 0000000..0f445f4
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java
@@ -0,0 +1,233 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.util.StringUtils;
+
+import edu.uci.ics.hivesterix.runtime.evaluator.BufferSerDeUtil;
+import edu.uci.ics.hivesterix.runtime.evaluator.SerializableBuffer;
+
+/**
+ * GenericUDAFAverage.
+ */
+@Description(name = "avg", value = "_FUNC_(x) - Returns the mean of a set of numbers")
+public class GenericUDAFAverage extends AbstractGenericUDAFResolver {
+
+ static final Log LOG = LogFactory.getLog(GenericUDAFAverage.class.getName());
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+ if (parameters.length != 1) {
+ throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected.");
+ }
+
+ if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+ switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ case STRING:
+ return new GenericUDAFAverageEvaluator();
+ case BOOLEAN:
+ default:
+ throw new UDFArgumentTypeException(0, "Only numeric or string type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+ }
+
+ /**
+ * GenericUDAFAverageEvaluator.
+ */
+ public static class GenericUDAFAverageEvaluator extends GenericUDAFEvaluator {
+
+ // For PARTIAL1 and COMPLETE
+ PrimitiveObjectInspector inputOI;
+
+ // For PARTIAL2 and FINAL
+ StructObjectInspector soi;
+ StructField countField;
+ StructField sumField;
+ LongObjectInspector countFieldOI;
+ DoubleObjectInspector sumFieldOI;
+
+ // For PARTIAL1 and PARTIAL2
+ Object[] partialResult;
+
+ // For FINAL and COMPLETE
+ DoubleWritable result;
+
+ @Override
+ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+ assert (parameters.length == 1);
+ super.init(m, parameters);
+
+ // init input
+ if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
+ inputOI = (PrimitiveObjectInspector) parameters[0];
+ } else {
+ soi = (StructObjectInspector) parameters[0];
+ countField = soi.getStructFieldRef("count");
+ sumField = soi.getStructFieldRef("sum");
+ countFieldOI = (LongObjectInspector) countField.getFieldObjectInspector();
+ sumFieldOI = (DoubleObjectInspector) sumField.getFieldObjectInspector();
+ }
+
+ // init output
+ if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
+ // The output of a partial aggregation is a struct containing
+ // a "long" count and a "double" sum.
+
+ ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
+ foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+ ArrayList<String> fname = new ArrayList<String>();
+ fname.add("count");
+ fname.add("sum");
+ partialResult = new Object[2];
+ partialResult[0] = new LongWritable(0);
+ partialResult[1] = new DoubleWritable(0);
+ return ObjectInspectorFactory.getStandardStructObjectInspector(fname, foi);
+
+ } else {
+ result = new DoubleWritable(0);
+ return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+ }
+ }
+
+ static class AverageAgg implements SerializableBuffer {
+ long count;
+ double sum;
+
+ @Override
+ public void deSerializeAggBuffer(byte[] data, int start, int len) {
+ count = BufferSerDeUtil.getLong(data, start);
+ start += 8;
+ sum = BufferSerDeUtil.getDouble(data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(byte[] data, int start, int len) {
+ BufferSerDeUtil.writeLong(count, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(sum, data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(DataOutput output) throws IOException {
+ output.writeLong(count);
+ output.writeDouble(sum);
+ }
+ };
+
+ @Override
+ public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+ AverageAgg result = new AverageAgg();
+ reset(result);
+ return result;
+ }
+
+ @Override
+ public void reset(AggregationBuffer agg) throws HiveException {
+ AverageAgg myagg = (AverageAgg) agg;
+ myagg.count = 0;
+ myagg.sum = 0;
+ }
+
+ boolean warned = false;
+
+ @Override
+ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+ assert (parameters.length == 1);
+ Object p = parameters[0];
+ if (p != null) {
+ AverageAgg myagg = (AverageAgg) agg;
+ try {
+ double v = PrimitiveObjectInspectorUtils.getDouble(p, inputOI);
+ myagg.count++;
+ myagg.sum += v;
+ } catch (NumberFormatException e) {
+ if (!warned) {
+ warned = true;
+ LOG.warn(getClass().getSimpleName() + " " + StringUtils.stringifyException(e));
+ LOG.warn(getClass().getSimpleName() + " ignoring similar exceptions.");
+ }
+ }
+ }
+ }
+
+ @Override
+ public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+ AverageAgg myagg = (AverageAgg) agg;
+ ((LongWritable) partialResult[0]).set(myagg.count);
+ ((DoubleWritable) partialResult[1]).set(myagg.sum);
+ return partialResult;
+ }
+
+ @Override
+ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+ if (partial != null) {
+ AverageAgg myagg = (AverageAgg) agg;
+ Object partialCount = soi.getStructFieldData(partial, countField);
+ Object partialSum = soi.getStructFieldData(partial, sumField);
+ myagg.count += countFieldOI.get(partialCount);
+ myagg.sum += sumFieldOI.get(partialSum);
+ }
+ }
+
+ @Override
+ public Object terminate(AggregationBuffer agg) throws HiveException {
+ AverageAgg myagg = (AverageAgg) agg;
+ if (myagg.count == 0) {
+ return null;
+ } else {
+ result.set(myagg.sum / myagg.count);
+ return result;
+ }
+ }
+ }
+
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCorrelation.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCorrelation.java
new file mode 100644
index 0000000..2c4022e
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCorrelation.java
@@ -0,0 +1,392 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+
+import edu.uci.ics.hivesterix.runtime.evaluator.BufferSerDeUtil;
+import edu.uci.ics.hivesterix.runtime.evaluator.SerializableBuffer;
+
+/**
+ * Compute the Pearson correlation coefficient corr(x, y), using the following
+ * stable one-pass method, based on: "Formulas for Robust, One-Pass Parallel
+ * Computation of Covariances and Arbitrary-Order Statistical Moments", Philippe
+ * Pebay, Sandia Labs and
+ * "The Art of Computer Programming, volume 2: Seminumerical Algorithms", Donald
+ * Knuth.
+ * Incremental: n : <count> mx_n = mx_(n-1) + [x_n - mx_(n-1)]/n : <xavg> my_n =
+ * my_(n-1) + [y_n - my_(n-1)]/n : <yavg> c_n = c_(n-1) + (x_n - mx_(n-1))*(y_n
+ * - my_n) : <covariance * n> vx_n = vx_(n-1) + (x_n - mx_n)(x_n - mx_(n-1)):
+ * <variance * n> vy_n = vy_(n-1) + (y_n - my_n)(y_n - my_(n-1)): <variance * n>
+ * Merge: c_(A,B) = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/(n_A+n_B)
+ * vx_(A,B) = vx_A + vx_B + (mx_A - mx_B)*(mx_A - mx_B)*n_A*n_B/(n_A+n_B)
+ * vy_(A,B) = vy_A + vy_B + (my_A - my_B)*(my_A - my_B)*n_A*n_B/(n_A+n_B)
+ */
+@Description(name = "corr", value = "_FUNC_(x,y) - Returns the Pearson coefficient of correlation\n"
+ + "between a set of number pairs", extended = "The function takes as arguments any pair of numeric types and returns a double.\n"
+ + "Any pair with a NULL is ignored. If the function is applied to an empty set or\n"
+ + "a singleton set, NULL will be returned. Otherwise, it computes the following:\n"
+ + " COVAR_POP(x,y)/(STDDEV_POP(x)*STDDEV_POP(y))\n"
+ + "where neither x nor y is null,\n"
+ + "COVAR_POP is the population covariance,\n" + "and STDDEV_POP is the population standard deviation.")
+public class GenericUDAFCorrelation extends AbstractGenericUDAFResolver {
+
+ static final Log LOG = LogFactory.getLog(GenericUDAFCorrelation.class.getName());
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+ if (parameters.length != 2) {
+ throw new UDFArgumentTypeException(parameters.length - 1, "Exactly two arguments are expected.");
+ }
+
+ if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+
+ if (parameters[1].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(1, "Only primitive type arguments are accepted but "
+ + parameters[1].getTypeName() + " is passed.");
+ }
+
+ switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ switch (((PrimitiveTypeInfo) parameters[1]).getPrimitiveCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ return new GenericUDAFCorrelationEvaluator();
+ case STRING:
+ case BOOLEAN:
+ default:
+ throw new UDFArgumentTypeException(1, "Only numeric type arguments are accepted but "
+ + parameters[1].getTypeName() + " is passed.");
+ }
+ case STRING:
+ case BOOLEAN:
+ default:
+ throw new UDFArgumentTypeException(0, "Only numeric type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+ }
+
+ /**
+ * Evaluate the Pearson correlation coefficient using a stable one-pass
+ * algorithm, based on work by Philippe Pébay and Donald Knuth.
+ * Incremental: n : <count> mx_n = mx_(n-1) + [x_n - mx_(n-1)]/n : <xavg>
+ * my_n = my_(n-1) + [y_n - my_(n-1)]/n : <yavg> c_n = c_(n-1) + (x_n -
+ * mx_(n-1))*(y_n - my_n) : <covariance * n> vx_n = vx_(n-1) + (x_n -
+ * mx_n)(x_n - mx_(n-1)): <variance * n> vy_n = vy_(n-1) + (y_n - my_n)(y_n
+ * - my_(n-1)): <variance * n>
+ * Merge: c_X = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/n_X vx_(A,B)
+ * = vx_A + vx_B + (mx_A - mx_B)*(mx_A - mx_B)*n_A*n_B/(n_A+n_B) vy_(A,B) =
+ * vy_A + vy_B + (my_A - my_B)*(my_A - my_B)*n_A*n_B/(n_A+n_B)
+ */
+ public static class GenericUDAFCorrelationEvaluator extends GenericUDAFEvaluator {
+
+ // For PARTIAL1 and COMPLETE
+ private PrimitiveObjectInspector xInputOI;
+ private PrimitiveObjectInspector yInputOI;
+
+ // For PARTIAL2 and FINAL
+ private StructObjectInspector soi;
+ private StructField countField;
+ private StructField xavgField;
+ private StructField yavgField;
+ private StructField xvarField;
+ private StructField yvarField;
+ private StructField covarField;
+ private LongObjectInspector countFieldOI;
+ private DoubleObjectInspector xavgFieldOI;
+ private DoubleObjectInspector yavgFieldOI;
+ private DoubleObjectInspector xvarFieldOI;
+ private DoubleObjectInspector yvarFieldOI;
+ private DoubleObjectInspector covarFieldOI;
+
+ // For PARTIAL1 and PARTIAL2
+ private Object[] partialResult;
+
+ // For FINAL and COMPLETE
+ private DoubleWritable result;
+
+ @Override
+ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+ super.init(m, parameters);
+
+ // init input
+ if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
+ assert (parameters.length == 2);
+ xInputOI = (PrimitiveObjectInspector) parameters[0];
+ yInputOI = (PrimitiveObjectInspector) parameters[1];
+ } else {
+ assert (parameters.length == 1);
+ soi = (StructObjectInspector) parameters[0];
+
+ countField = soi.getStructFieldRef("count");
+ xavgField = soi.getStructFieldRef("xavg");
+ yavgField = soi.getStructFieldRef("yavg");
+ xvarField = soi.getStructFieldRef("xvar");
+ yvarField = soi.getStructFieldRef("yvar");
+ covarField = soi.getStructFieldRef("covar");
+
+ countFieldOI = (LongObjectInspector) countField.getFieldObjectInspector();
+ xavgFieldOI = (DoubleObjectInspector) xavgField.getFieldObjectInspector();
+ yavgFieldOI = (DoubleObjectInspector) yavgField.getFieldObjectInspector();
+ xvarFieldOI = (DoubleObjectInspector) xvarField.getFieldObjectInspector();
+ yvarFieldOI = (DoubleObjectInspector) yvarField.getFieldObjectInspector();
+ covarFieldOI = (DoubleObjectInspector) covarField.getFieldObjectInspector();
+ }
+
+ // init output
+ if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
+ // The output of a partial aggregation is a struct containing
+ // a long count, two double averages, two double variances,
+ // and a double covariance.
+
+ ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
+
+ foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+
+ ArrayList<String> fname = new ArrayList<String>();
+ fname.add("count");
+ fname.add("xavg");
+ fname.add("yavg");
+ fname.add("xvar");
+ fname.add("yvar");
+ fname.add("covar");
+
+ partialResult = new Object[6];
+ partialResult[0] = new LongWritable(0);
+ partialResult[1] = new DoubleWritable(0);
+ partialResult[2] = new DoubleWritable(0);
+ partialResult[3] = new DoubleWritable(0);
+ partialResult[4] = new DoubleWritable(0);
+ partialResult[5] = new DoubleWritable(0);
+
+ return ObjectInspectorFactory.getStandardStructObjectInspector(fname, foi);
+
+ } else {
+ setResult(new DoubleWritable(0));
+ return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+ }
+ }
+
+ static class StdAgg implements SerializableBuffer {
+ long count; // number n of elements
+ double xavg; // average of x elements
+ double yavg; // average of y elements
+ double xvar; // n times the variance of x elements
+ double yvar; // n times the variance of y elements
+ double covar; // n times the covariance
+
+ @Override
+ public void deSerializeAggBuffer(byte[] data, int start, int len) {
+ count = BufferSerDeUtil.getLong(data, start);
+ start += 8;
+ xavg = BufferSerDeUtil.getDouble(data, start);
+ start += 8;
+ yavg = BufferSerDeUtil.getDouble(data, start);
+ start += 8;
+ xvar = BufferSerDeUtil.getDouble(data, start);
+ start += 8;
+ yvar = BufferSerDeUtil.getDouble(data, start);
+ start += 8;
+ covar = BufferSerDeUtil.getDouble(data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(byte[] data, int start, int len) {
+ BufferSerDeUtil.writeLong(count, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(xavg, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(yavg, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(xvar, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(yvar, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(covar, data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(DataOutput output) throws IOException {
+ output.writeLong(count);
+ output.writeDouble(xavg);
+ output.writeDouble(yavg);
+ output.writeDouble(xvar);
+ output.writeDouble(yvar);
+ output.writeDouble(covar);
+ }
+ };
+
+ @Override
+ public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+ StdAgg result = new StdAgg();
+ reset(result);
+ return result;
+ }
+
+ @Override
+ public void reset(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+ myagg.count = 0;
+ myagg.xavg = 0;
+ myagg.yavg = 0;
+ myagg.xvar = 0;
+ myagg.yvar = 0;
+ myagg.covar = 0;
+ }
+
+ @Override
+ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+ assert (parameters.length == 2);
+ Object px = parameters[0];
+ Object py = parameters[1];
+ if (px != null && py != null) {
+ StdAgg myagg = (StdAgg) agg;
+ double vx = PrimitiveObjectInspectorUtils.getDouble(px, xInputOI);
+ double vy = PrimitiveObjectInspectorUtils.getDouble(py, yInputOI);
+ double xavgOld = myagg.xavg;
+ double yavgOld = myagg.yavg;
+ myagg.count++;
+ myagg.xavg += (vx - xavgOld) / myagg.count;
+ myagg.yavg += (vy - yavgOld) / myagg.count;
+ if (myagg.count > 1) {
+ myagg.covar += (vx - xavgOld) * (vy - myagg.yavg);
+ myagg.xvar += (vx - xavgOld) * (vx - myagg.xavg);
+ myagg.yvar += (vy - yavgOld) * (vy - myagg.yavg);
+ }
+ }
+ }
+
+ @Override
+ public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+ ((LongWritable) partialResult[0]).set(myagg.count);
+ ((DoubleWritable) partialResult[1]).set(myagg.xavg);
+ ((DoubleWritable) partialResult[2]).set(myagg.yavg);
+ ((DoubleWritable) partialResult[3]).set(myagg.xvar);
+ ((DoubleWritable) partialResult[4]).set(myagg.yvar);
+ ((DoubleWritable) partialResult[5]).set(myagg.covar);
+ return partialResult;
+ }
+
+ @Override
+ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+ if (partial != null) {
+ StdAgg myagg = (StdAgg) agg;
+
+ Object partialCount = soi.getStructFieldData(partial, countField);
+ Object partialXAvg = soi.getStructFieldData(partial, xavgField);
+ Object partialYAvg = soi.getStructFieldData(partial, yavgField);
+ Object partialXVar = soi.getStructFieldData(partial, xvarField);
+ Object partialYVar = soi.getStructFieldData(partial, yvarField);
+ Object partialCovar = soi.getStructFieldData(partial, covarField);
+
+ long nA = myagg.count;
+ long nB = countFieldOI.get(partialCount);
+
+ if (nA == 0) {
+ // Just copy the information since there is nothing so far
+ myagg.count = countFieldOI.get(partialCount);
+ myagg.xavg = xavgFieldOI.get(partialXAvg);
+ myagg.yavg = yavgFieldOI.get(partialYAvg);
+ myagg.xvar = xvarFieldOI.get(partialXVar);
+ myagg.yvar = yvarFieldOI.get(partialYVar);
+ myagg.covar = covarFieldOI.get(partialCovar);
+ }
+
+ if (nA != 0 && nB != 0) {
+ // Merge the two partials
+ double xavgA = myagg.xavg;
+ double yavgA = myagg.yavg;
+ double xavgB = xavgFieldOI.get(partialXAvg);
+ double yavgB = yavgFieldOI.get(partialYAvg);
+ double xvarB = xvarFieldOI.get(partialXVar);
+ double yvarB = yvarFieldOI.get(partialYVar);
+ double covarB = covarFieldOI.get(partialCovar);
+
+ myagg.count += nB;
+ myagg.xavg = (xavgA * nA + xavgB * nB) / myagg.count;
+ myagg.yavg = (yavgA * nA + yavgB * nB) / myagg.count;
+ myagg.xvar += xvarB + (xavgA - xavgB) * (xavgA - xavgB) * myagg.count;
+ myagg.yvar += yvarB + (yavgA - yavgB) * (yavgA - yavgB) * myagg.count;
+ myagg.covar += covarB + (xavgA - xavgB) * (yavgA - yavgB) * ((double) (nA * nB) / myagg.count);
+ }
+ }
+ }
+
+ @Override
+ public Object terminate(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+
+ if (myagg.count < 2) { // SQL standard - return null for zero or one
+ // pair
+ return null;
+ } else {
+ getResult().set(myagg.covar / java.lang.Math.sqrt(myagg.xvar) / java.lang.Math.sqrt(myagg.yvar));
+ return getResult();
+ }
+ }
+
+ public void setResult(DoubleWritable result) {
+ this.result = result;
+ }
+
+ public DoubleWritable getResult() {
+ return result;
+ }
+ }
+
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java
new file mode 100644
index 0000000..dc5eef0
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java
@@ -0,0 +1,170 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+
+import edu.uci.ics.hivesterix.runtime.evaluator.BufferSerDeUtil;
+import edu.uci.ics.hivesterix.runtime.evaluator.SerializableBuffer;
+
+/**
+ * This class implements the COUNT aggregation function as in SQL.
+ */
+@Description(name = "count", value = "_FUNC_(*) - Returns the total number of retrieved rows, including "
+ + "rows containing NULL values.\n"
+
+ + "_FUNC_(expr) - Returns the number of rows for which the supplied " + "expression is non-NULL.\n"
+
+ + "_FUNC_(DISTINCT expr[, expr...]) - Returns the number of rows for "
+ + "which the supplied expression(s) are unique and non-NULL.")
+public class GenericUDAFCount implements GenericUDAFResolver2 {
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+ // This method implementation is preserved for backward compatibility.
+ return new GenericUDAFCountEvaluator();
+ }
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo paramInfo) throws SemanticException {
+
+ TypeInfo[] parameters = paramInfo.getParameters();
+
+ if (parameters.length == 0) {
+ if (!paramInfo.isAllColumns()) {
+ throw new UDFArgumentException("Argument expected");
+ }
+ assert !paramInfo.isDistinct() : "DISTINCT not supported with *";
+ } else {
+ if (parameters.length > 1 && !paramInfo.isDistinct()) {
+ throw new UDFArgumentException("DISTINCT keyword must be specified");
+ }
+ assert !paramInfo.isAllColumns() : "* not supported in expression list";
+ }
+
+ return new GenericUDAFCountEvaluator().setCountAllColumns(paramInfo.isAllColumns());
+ }
+
+ /**
+ * GenericUDAFCountEvaluator.
+ */
+ public static class GenericUDAFCountEvaluator extends GenericUDAFEvaluator {
+ private boolean countAllColumns = false;
+ private LongObjectInspector partialCountAggOI;
+ private LongWritable result;
+
+ @Override
+ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+ super.init(m, parameters);
+ partialCountAggOI = PrimitiveObjectInspectorFactory.writableLongObjectInspector;
+ result = new LongWritable(0);
+ return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
+ }
+
+ private GenericUDAFCountEvaluator setCountAllColumns(boolean countAllCols) {
+ countAllColumns = countAllCols;
+ return this;
+ }
+
+ /** class for storing count value. */
+ static class CountAgg implements SerializableBuffer {
+ long value;
+
+ @Override
+ public void deSerializeAggBuffer(byte[] data, int start, int len) {
+ value = BufferSerDeUtil.getLong(data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(byte[] data, int start, int len) {
+ BufferSerDeUtil.writeLong(value, data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(DataOutput output) throws IOException {
+ output.writeLong(value);
+ }
+ }
+
+ @Override
+ public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+ CountAgg buffer = new CountAgg();
+ reset(buffer);
+ return buffer;
+ }
+
+ @Override
+ public void reset(AggregationBuffer agg) throws HiveException {
+ ((CountAgg) agg).value = 0;
+ }
+
+ @Override
+ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+ // parameters == null means the input table/split is empty
+ if (parameters == null) {
+ return;
+ }
+ if (countAllColumns) {
+ assert parameters.length == 0;
+ ((CountAgg) agg).value++;
+ } else {
+ assert parameters.length > 0;
+ boolean countThisRow = true;
+ for (Object nextParam : parameters) {
+ if (nextParam == null) {
+ countThisRow = false;
+ break;
+ }
+ }
+ if (countThisRow) {
+ ((CountAgg) agg).value++;
+ }
+ }
+ }
+
+ @Override
+ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+ if (partial != null) {
+ long p = partialCountAggOI.get(partial);
+ ((CountAgg) agg).value += p;
+ }
+ }
+
+ @Override
+ public Object terminate(AggregationBuffer agg) throws HiveException {
+ result.set(((CountAgg) agg).value);
+ return result;
+ }
+
+ @Override
+ public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+ return terminate(agg);
+ }
+ }
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java
new file mode 100644
index 0000000..0c4448b
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCovariance.java
@@ -0,0 +1,341 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+
+import edu.uci.ics.hivesterix.runtime.evaluator.BufferSerDeUtil;
+import edu.uci.ics.hivesterix.runtime.evaluator.SerializableBuffer;
+
+/**
+ * Compute the covariance covar_pop(x, y), using the following one-pass method
+ * (ref. "Formulas for Robust, One-Pass Parallel Computation of Covariances and
+ * Arbitrary-Order Statistical Moments", Philippe Pebay, Sandia Labs):
+ * Incremental: n : <count> mx_n = mx_(n-1) + [x_n - mx_(n-1)]/n : <xavg> my_n =
+ * my_(n-1) + [y_n - my_(n-1)]/n : <yavg> c_n = c_(n-1) + (x_n - mx_(n-1))*(y_n
+ * - my_n) : <covariance * n>
+ * Merge: c_X = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/n_X
+ */
+@Description(name = "covariance,covar_pop", value = "_FUNC_(x,y) - Returns the population covariance of a set of number pairs", extended = "The function takes as arguments any pair of numeric types and returns a double.\n"
+ + "Any pair with a NULL is ignored. If the function is applied to an empty set, NULL\n"
+ + "will be returned. Otherwise, it computes the following:\n"
+ + " (SUM(x*y)-SUM(x)*SUM(y)/COUNT(x,y))/COUNT(x,y)\n" + "where neither x nor y is null.")
+public class GenericUDAFCovariance extends AbstractGenericUDAFResolver {
+
+ static final Log LOG = LogFactory.getLog(GenericUDAFCovariance.class.getName());
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+ if (parameters.length != 2) {
+ throw new UDFArgumentTypeException(parameters.length - 1, "Exactly two arguments are expected.");
+ }
+
+ if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+
+ if (parameters[1].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(1, "Only primitive type arguments are accepted but "
+ + parameters[1].getTypeName() + " is passed.");
+ }
+
+ switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ switch (((PrimitiveTypeInfo) parameters[1]).getPrimitiveCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ return new GenericUDAFCovarianceEvaluator();
+ case STRING:
+ case BOOLEAN:
+ default:
+ throw new UDFArgumentTypeException(1, "Only numeric or string type arguments are accepted but "
+ + parameters[1].getTypeName() + " is passed.");
+ }
+ case STRING:
+ case BOOLEAN:
+ default:
+ throw new UDFArgumentTypeException(0, "Only numeric or string type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+ }
+
+ /**
+ * Evaluate the variance using the algorithm described in
+ * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance,
+ * presumably by Pébay, Philippe (2008), in "Formulas for Robust, One-Pass
+ * Parallel Computation of Covariances and Arbitrary-Order Statistical
+ * Moments", Technical Report SAND2008-6212, Sandia National Laboratories,
+ * http://infoserve.sandia.gov/sand_doc/2008/086212.pdf
+ * Incremental: n : <count> mx_n = mx_(n-1) + [x_n - mx_(n-1)]/n : <xavg>
+ * my_n = my_(n-1) + [y_n - my_(n-1)]/n : <yavg> c_n = c_(n-1) + (x_n -
+ * mx_(n-1))*(y_n - my_n) : <covariance * n>
+ * Merge: c_X = c_A + c_B + (mx_A - mx_B)*(my_A - my_B)*n_A*n_B/n_X
+ * This one-pass algorithm is stable.
+ */
+ public static class GenericUDAFCovarianceEvaluator extends GenericUDAFEvaluator {
+
+ // For PARTIAL1 and COMPLETE
+ private PrimitiveObjectInspector xInputOI;
+ private PrimitiveObjectInspector yInputOI;
+
+ // For PARTIAL2 and FINAL
+ private StructObjectInspector soi;
+ private StructField countField;
+ private StructField xavgField;
+ private StructField yavgField;
+ private StructField covarField;
+ private LongObjectInspector countFieldOI;
+ private DoubleObjectInspector xavgFieldOI;
+ private DoubleObjectInspector yavgFieldOI;
+ private DoubleObjectInspector covarFieldOI;
+
+ // For PARTIAL1 and PARTIAL2
+ private Object[] partialResult;
+
+ // For FINAL and COMPLETE
+ private DoubleWritable result;
+
+ @Override
+ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+ super.init(m, parameters);
+
+ // init input
+ if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
+ assert (parameters.length == 2);
+ xInputOI = (PrimitiveObjectInspector) parameters[0];
+ yInputOI = (PrimitiveObjectInspector) parameters[1];
+ } else {
+ assert (parameters.length == 1);
+ soi = (StructObjectInspector) parameters[0];
+
+ countField = soi.getStructFieldRef("count");
+ xavgField = soi.getStructFieldRef("xavg");
+ yavgField = soi.getStructFieldRef("yavg");
+ covarField = soi.getStructFieldRef("covar");
+
+ countFieldOI = (LongObjectInspector) countField.getFieldObjectInspector();
+ xavgFieldOI = (DoubleObjectInspector) xavgField.getFieldObjectInspector();
+ yavgFieldOI = (DoubleObjectInspector) yavgField.getFieldObjectInspector();
+ covarFieldOI = (DoubleObjectInspector) covarField.getFieldObjectInspector();
+ }
+
+ // init output
+ if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
+ // The output of a partial aggregation is a struct containing
+ // a long count, two double averages, and a double covariance.
+
+ ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
+
+ foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+
+ ArrayList<String> fname = new ArrayList<String>();
+ fname.add("count");
+ fname.add("xavg");
+ fname.add("yavg");
+ fname.add("covar");
+
+ partialResult = new Object[4];
+ partialResult[0] = new LongWritable(0);
+ partialResult[1] = new DoubleWritable(0);
+ partialResult[2] = new DoubleWritable(0);
+ partialResult[3] = new DoubleWritable(0);
+
+ return ObjectInspectorFactory.getStandardStructObjectInspector(fname, foi);
+
+ } else {
+ setResult(new DoubleWritable(0));
+ return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+ }
+ }
+
+ static class StdAgg implements SerializableBuffer {
+ long count; // number n of elements
+ double xavg; // average of x elements
+ double yavg; // average of y elements
+ double covar; // n times the covariance
+
+ @Override
+ public void deSerializeAggBuffer(byte[] data, int start, int len) {
+ count = BufferSerDeUtil.getLong(data, start);
+ start += 8;
+ xavg = BufferSerDeUtil.getDouble(data, start);
+ start += 8;
+ yavg = BufferSerDeUtil.getDouble(data, start);
+ start += 8;
+ covar = BufferSerDeUtil.getDouble(data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(byte[] data, int start, int len) {
+ BufferSerDeUtil.writeLong(count, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(xavg, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(yavg, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(covar, data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(DataOutput output) throws IOException {
+ output.writeLong(count);
+ output.writeDouble(xavg);
+ output.writeDouble(yavg);
+ output.writeDouble(covar);
+ }
+ };
+
+ @Override
+ public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+ StdAgg result = new StdAgg();
+ reset(result);
+ return result;
+ }
+
+ @Override
+ public void reset(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+ myagg.count = 0;
+ myagg.xavg = 0;
+ myagg.yavg = 0;
+ myagg.covar = 0;
+ }
+
+ @Override
+ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+ assert (parameters.length == 2);
+ Object px = parameters[0];
+ Object py = parameters[1];
+ if (px != null && py != null) {
+ StdAgg myagg = (StdAgg) agg;
+ double vx = PrimitiveObjectInspectorUtils.getDouble(px, xInputOI);
+ double vy = PrimitiveObjectInspectorUtils.getDouble(py, yInputOI);
+ myagg.count++;
+ myagg.yavg = myagg.yavg + (vy - myagg.yavg) / myagg.count;
+ if (myagg.count > 1) {
+ myagg.covar += (vx - myagg.xavg) * (vy - myagg.yavg);
+ }
+ myagg.xavg = myagg.xavg + (vx - myagg.xavg) / myagg.count;
+ }
+ }
+
+ @Override
+ public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+ ((LongWritable) partialResult[0]).set(myagg.count);
+ ((DoubleWritable) partialResult[1]).set(myagg.xavg);
+ ((DoubleWritable) partialResult[2]).set(myagg.yavg);
+ ((DoubleWritable) partialResult[3]).set(myagg.covar);
+ return partialResult;
+ }
+
+ @Override
+ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+ if (partial != null) {
+ StdAgg myagg = (StdAgg) agg;
+
+ Object partialCount = soi.getStructFieldData(partial, countField);
+ Object partialXAvg = soi.getStructFieldData(partial, xavgField);
+ Object partialYAvg = soi.getStructFieldData(partial, yavgField);
+ Object partialCovar = soi.getStructFieldData(partial, covarField);
+
+ long nA = myagg.count;
+ long nB = countFieldOI.get(partialCount);
+
+ if (nA == 0) {
+ // Just copy the information since there is nothing so far
+ myagg.count = countFieldOI.get(partialCount);
+ myagg.xavg = xavgFieldOI.get(partialXAvg);
+ myagg.yavg = yavgFieldOI.get(partialYAvg);
+ myagg.covar = covarFieldOI.get(partialCovar);
+ }
+
+ if (nA != 0 && nB != 0) {
+ // Merge the two partials
+ double xavgA = myagg.xavg;
+ double yavgA = myagg.yavg;
+ double xavgB = xavgFieldOI.get(partialXAvg);
+ double yavgB = yavgFieldOI.get(partialYAvg);
+ double covarB = covarFieldOI.get(partialCovar);
+
+ myagg.count += nB;
+ myagg.xavg = (xavgA * nA + xavgB * nB) / myagg.count;
+ myagg.yavg = (yavgA * nA + yavgB * nB) / myagg.count;
+ myagg.covar += covarB + (xavgA - xavgB) * (yavgA - yavgB) * ((double) (nA * nB) / myagg.count);
+ }
+ }
+ }
+
+ @Override
+ public Object terminate(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+
+ if (myagg.count == 0) { // SQL standard - return null for zero
+ // elements
+ return null;
+ } else {
+ getResult().set(myagg.covar / (myagg.count));
+ return getResult();
+ }
+ }
+
+ public void setResult(DoubleWritable result) {
+ this.result = result;
+ }
+
+ public DoubleWritable getResult() {
+ return result;
+ }
+ }
+
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
new file mode 100644
index 0000000..afdc397
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
@@ -0,0 +1,272 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.util.StringUtils;
+
+import edu.uci.ics.hivesterix.runtime.evaluator.BufferSerDeUtil;
+import edu.uci.ics.hivesterix.runtime.evaluator.SerializableBuffer;
+
+/**
+ * GenericUDAFSum.
+ */
+@Description(name = "sum", value = "_FUNC_(x) - Returns the sum of a set of numbers")
+public class GenericUDAFSum extends AbstractGenericUDAFResolver {
+
+ static final Log LOG = LogFactory.getLog(GenericUDAFSum.class.getName());
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+ if (parameters.length != 1) {
+ throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected.");
+ }
+
+ if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+ switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new GenericUDAFSumLong();
+ case FLOAT:
+ case DOUBLE:
+ case STRING:
+ return new GenericUDAFSumDouble();
+ case BOOLEAN:
+ default:
+ throw new UDFArgumentTypeException(0, "Only numeric or string type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+ }
+
+ /**
+ * GenericUDAFSumDouble.
+ */
+ public static class GenericUDAFSumDouble extends GenericUDAFEvaluator {
+ private PrimitiveObjectInspector inputOI;
+ private DoubleWritable result;
+
+ @Override
+ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+ assert (parameters.length == 1);
+ super.init(m, parameters);
+ result = new DoubleWritable(0);
+ inputOI = (PrimitiveObjectInspector) parameters[0];
+ return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+ }
+
+ /** class for storing double sum value. */
+ static class SumDoubleAgg implements SerializableBuffer {
+ boolean empty;
+ double sum;
+
+ @Override
+ public void deSerializeAggBuffer(byte[] data, int start, int len) {
+ empty = BufferSerDeUtil.getBoolean(data, start);
+ start += 1;
+ sum = BufferSerDeUtil.getDouble(data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(byte[] data, int start, int len) {
+ BufferSerDeUtil.writeBoolean(empty, data, start);
+ start += 1;
+ BufferSerDeUtil.writeDouble(sum, data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(DataOutput output) throws IOException {
+ output.writeBoolean(empty);
+ output.writeDouble(sum);
+ }
+ }
+
+ @Override
+ public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+ SumDoubleAgg result = new SumDoubleAgg();
+ reset(result);
+ return result;
+ }
+
+ @Override
+ public void reset(AggregationBuffer agg) throws HiveException {
+ SumDoubleAgg myagg = (SumDoubleAgg) agg;
+ myagg.empty = true;
+ myagg.sum = 0;
+ }
+
+ boolean warned = false;
+
+ @Override
+ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+ assert (parameters.length == 1);
+ try {
+ merge(agg, parameters[0]);
+ } catch (NumberFormatException e) {
+ if (!warned) {
+ warned = true;
+ LOG.warn(getClass().getSimpleName() + " " + StringUtils.stringifyException(e));
+ LOG.warn(getClass().getSimpleName() + " ignoring similar exceptions.");
+ }
+ }
+ }
+
+ @Override
+ public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+ return terminate(agg);
+ }
+
+ @Override
+ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+ if (partial != null) {
+ SumDoubleAgg myagg = (SumDoubleAgg) agg;
+ myagg.empty = false;
+ myagg.sum += PrimitiveObjectInspectorUtils.getDouble(partial, inputOI);
+ }
+ }
+
+ @Override
+ public Object terminate(AggregationBuffer agg) throws HiveException {
+ SumDoubleAgg myagg = (SumDoubleAgg) agg;
+ if (myagg.empty) {
+ return null;
+ }
+ result.set(myagg.sum);
+ return result;
+ }
+
+ }
+
+ /**
+ * GenericUDAFSumLong.
+ */
+ public static class GenericUDAFSumLong extends GenericUDAFEvaluator {
+ private PrimitiveObjectInspector inputOI;
+ private LongWritable result;
+
+ @Override
+ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+ assert (parameters.length == 1);
+ super.init(m, parameters);
+ result = new LongWritable(0);
+ inputOI = (PrimitiveObjectInspector) parameters[0];
+ return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
+ }
+
+ /** class for storing double sum value. */
+ static class SumLongAgg implements SerializableBuffer {
+ boolean empty;
+ long sum;
+
+ @Override
+ public void deSerializeAggBuffer(byte[] data, int start, int len) {
+ empty = BufferSerDeUtil.getBoolean(data, start);
+ start += 1;
+ sum = BufferSerDeUtil.getLong(data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(byte[] data, int start, int len) {
+ BufferSerDeUtil.writeBoolean(empty, data, start);
+ start += 1;
+ BufferSerDeUtil.writeLong(sum, data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(DataOutput output) throws IOException {
+ output.writeBoolean(empty);
+ output.writeLong(sum);
+ }
+ }
+
+ @Override
+ public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+ SumLongAgg result = new SumLongAgg();
+ reset(result);
+ return result;
+ }
+
+ @Override
+ public void reset(AggregationBuffer agg) throws HiveException {
+ SumLongAgg myagg = (SumLongAgg) agg;
+ myagg.empty = true;
+ myagg.sum = 0;
+ }
+
+ private boolean warned = false;
+
+ @Override
+ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+ assert (parameters.length == 1);
+ try {
+ merge(agg, parameters[0]);
+ } catch (NumberFormatException e) {
+ if (!warned) {
+ warned = true;
+ LOG.warn(getClass().getSimpleName() + " " + StringUtils.stringifyException(e));
+ }
+ }
+ }
+
+ @Override
+ public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+ return terminate(agg);
+ }
+
+ @Override
+ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+ if (partial != null) {
+ SumLongAgg myagg = (SumLongAgg) agg;
+ myagg.sum += PrimitiveObjectInspectorUtils.getLong(partial, inputOI);
+ myagg.empty = false;
+ }
+ }
+
+ @Override
+ public Object terminate(AggregationBuffer agg) throws HiveException {
+ SumLongAgg myagg = (SumLongAgg) agg;
+ if (myagg.empty) {
+ return null;
+ }
+ result.set(myagg.sum);
+ return result;
+ }
+
+ }
+
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java
new file mode 100644
index 0000000..e839008
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java
@@ -0,0 +1,305 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.util.StringUtils;
+
+import edu.uci.ics.hivesterix.runtime.evaluator.BufferSerDeUtil;
+import edu.uci.ics.hivesterix.runtime.evaluator.SerializableBuffer;
+
+/**
+ * Compute the variance. This class is extended by: GenericUDAFVarianceSample
+ * GenericUDAFStd GenericUDAFStdSample
+ */
+@Description(name = "variance,var_pop", value = "_FUNC_(x) - Returns the variance of a set of numbers")
+public class GenericUDAFVariance extends AbstractGenericUDAFResolver {
+
+ static final Log LOG = LogFactory.getLog(GenericUDAFVariance.class.getName());
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+ if (parameters.length != 1) {
+ throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected.");
+ }
+
+ if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+ switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ case STRING:
+ return new GenericUDAFVarianceEvaluator();
+ case BOOLEAN:
+ default:
+ throw new UDFArgumentTypeException(0, "Only numeric or string type arguments are accepted but "
+ + parameters[0].getTypeName() + " is passed.");
+ }
+ }
+
+ /**
+ * Evaluate the variance using the algorithm described by Chan, Golub, and
+ * LeVeque in
+ * "Algorithms for computing the sample variance: analysis and recommendations"
+ * The American Statistician, 37 (1983) pp. 242--247.
+ * variance = variance1 + variance2 + n/(m*(m+n)) * pow(((m/n)*t1 - t2),2)
+ * where: - variance is sum[x-avg^2] (this is actually n times the variance)
+ * and is updated at every step. - n is the count of elements in chunk1 - m
+ * is the count of elements in chunk2 - t1 = sum of elements in chunk1, t2 =
+ * sum of elements in chunk2.
+ * This algorithm was proven to be numerically stable by J.L. Barlow in
+ * "Error analysis of a pairwise summation algorithm to compute sample variance"
+ * Numer. Math, 58 (1991) pp. 583--590
+ */
+ public static class GenericUDAFVarianceEvaluator extends GenericUDAFEvaluator {
+
+ // For PARTIAL1 and COMPLETE
+ private PrimitiveObjectInspector inputOI;
+
+ // For PARTIAL2 and FINAL
+ private StructObjectInspector soi;
+ private StructField countField;
+ private StructField sumField;
+ private StructField varianceField;
+ private LongObjectInspector countFieldOI;
+ private DoubleObjectInspector sumFieldOI;
+
+ // For PARTIAL1 and PARTIAL2
+ private Object[] partialResult;
+
+ // For FINAL and COMPLETE
+ private DoubleWritable result;
+
+ @Override
+ public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+ assert (parameters.length == 1);
+ super.init(m, parameters);
+
+ // init input
+ if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
+ inputOI = (PrimitiveObjectInspector) parameters[0];
+ } else {
+ soi = (StructObjectInspector) parameters[0];
+
+ countField = soi.getStructFieldRef("count");
+ sumField = soi.getStructFieldRef("sum");
+ varianceField = soi.getStructFieldRef("variance");
+
+ countFieldOI = (LongObjectInspector) countField.getFieldObjectInspector();
+ sumFieldOI = (DoubleObjectInspector) sumField.getFieldObjectInspector();
+ }
+
+ // init output
+ if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
+ // The output of a partial aggregation is a struct containing
+ // a long count and doubles sum and variance.
+
+ ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
+
+ foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+ foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
+
+ ArrayList<String> fname = new ArrayList<String>();
+ fname.add("count");
+ fname.add("sum");
+ fname.add("variance");
+
+ partialResult = new Object[3];
+ partialResult[0] = new LongWritable(0);
+ partialResult[1] = new DoubleWritable(0);
+ partialResult[2] = new DoubleWritable(0);
+
+ return ObjectInspectorFactory.getStandardStructObjectInspector(fname, foi);
+
+ } else {
+ setResult(new DoubleWritable(0));
+ return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+ }
+ }
+
+ static class StdAgg implements SerializableBuffer {
+ long count; // number of elements
+ double sum; // sum of elements
+ double variance; // sum[x-avg^2] (this is actually n times the
+ // variance)
+
+ @Override
+ public void deSerializeAggBuffer(byte[] data, int start, int len) {
+ count = BufferSerDeUtil.getLong(data, start);
+ start += 8;
+ sum = BufferSerDeUtil.getDouble(data, start);
+ start += 8;
+ variance = BufferSerDeUtil.getDouble(data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(byte[] data, int start, int len) {
+ BufferSerDeUtil.writeLong(count, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(sum, data, start);
+ start += 8;
+ BufferSerDeUtil.writeDouble(variance, data, start);
+ }
+
+ @Override
+ public void serializeAggBuffer(DataOutput output) throws IOException {
+ output.writeLong(count);
+ output.writeDouble(sum);
+ output.writeDouble(variance);
+ }
+ };
+
+ @Override
+ public AggregationBuffer getNewAggregationBuffer() throws HiveException {
+ StdAgg result = new StdAgg();
+ reset(result);
+ return result;
+ }
+
+ @Override
+ public void reset(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+ myagg.count = 0;
+ myagg.sum = 0;
+ myagg.variance = 0;
+ }
+
+ private boolean warned = false;
+
+ @Override
+ public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
+ assert (parameters.length == 1);
+ Object p = parameters[0];
+ if (p != null) {
+ StdAgg myagg = (StdAgg) agg;
+ try {
+ double v = PrimitiveObjectInspectorUtils.getDouble(p, inputOI);
+ myagg.count++;
+ myagg.sum += v;
+ if (myagg.count > 1) {
+ double t = myagg.count * v - myagg.sum;
+ myagg.variance += (t * t) / ((double) myagg.count * (myagg.count - 1));
+ }
+ } catch (NumberFormatException e) {
+ if (!warned) {
+ warned = true;
+ LOG.warn(getClass().getSimpleName() + " " + StringUtils.stringifyException(e));
+ LOG.warn(getClass().getSimpleName() + " ignoring similar exceptions.");
+ }
+ }
+ }
+ }
+
+ @Override
+ public Object terminatePartial(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+ ((LongWritable) partialResult[0]).set(myagg.count);
+ ((DoubleWritable) partialResult[1]).set(myagg.sum);
+ ((DoubleWritable) partialResult[2]).set(myagg.variance);
+ return partialResult;
+ }
+
+ @Override
+ public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+ if (partial != null) {
+ StdAgg myagg = (StdAgg) agg;
+
+ Object partialCount = soi.getStructFieldData(partial, countField);
+ Object partialSum = soi.getStructFieldData(partial, sumField);
+ Object partialVariance = soi.getStructFieldData(partial, varianceField);
+
+ long n = myagg.count;
+ long m = countFieldOI.get(partialCount);
+
+ if (n == 0) {
+ // Just copy the information since there is nothing so far
+ myagg.variance = sumFieldOI.get(partialVariance);
+ myagg.count = countFieldOI.get(partialCount);
+ myagg.sum = sumFieldOI.get(partialSum);
+ }
+
+ if (m != 0 && n != 0) {
+ // Merge the two partials
+
+ double a = myagg.sum;
+ double b = sumFieldOI.get(partialSum);
+
+ myagg.count += m;
+ myagg.sum += b;
+ double t = (m / (double) n) * a - b;
+ myagg.variance += sumFieldOI.get(partialVariance) + ((n / (double) m) / ((double) n + m)) * t * t;
+ }
+ }
+ }
+
+ @Override
+ public Object terminate(AggregationBuffer agg) throws HiveException {
+ StdAgg myagg = (StdAgg) agg;
+
+ if (myagg.count == 0) { // SQL standard - return null for zero
+ // elements
+ return null;
+ } else {
+ if (myagg.count > 1) {
+ getResult().set(myagg.variance / (myagg.count));
+ } else { // for one element the variance is always 0
+ getResult().set(0);
+ }
+ return getResult();
+ }
+ }
+
+ public void setResult(DoubleWritable result) {
+ this.result = result;
+ }
+
+ public DoubleWritable getResult() {
+ return result;
+ }
+ }
+
+}
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java
new file mode 100644
index 0000000..7920001
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/serde2/lazy/objectinspector/LazyObjectInspectorFactory.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.lazy.objectinspector;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.io.Text;
+
+/**
+ * ObjectInspectorFactory is the primary way to create new ObjectInspector
+ * instances.
+ * SerDe classes should call the static functions in this library to create an
+ * ObjectInspector to return to the caller of SerDe2.getObjectInspector().
+ * The reason of having caches here is that ObjectInspectors do not have an
+ * internal state - so ObjectInspectors with the same construction parameters
+ * should result in exactly the same ObjectInspector.
+ */
+public final class LazyObjectInspectorFactory {
+
+ static ConcurrentHashMap<ArrayList<Object>, LazySimpleStructObjectInspector> cachedLazySimpleStructObjectInspector = new ConcurrentHashMap<ArrayList<Object>, LazySimpleStructObjectInspector>();
+
+ public static LazySimpleStructObjectInspector getLazySimpleStructObjectInspector(List<String> structFieldNames,
+ List<ObjectInspector> structFieldObjectInspectors, byte separator, Text nullSequence,
+ boolean lastColumnTakesRest, boolean escaped, byte escapeChar) {
+ ArrayList<Object> signature = new ArrayList<Object>();
+ signature.add(structFieldNames);
+ signature.add(structFieldObjectInspectors);
+ signature.add(Byte.valueOf(separator));
+ signature.add(nullSequence.toString());
+ signature.add(Boolean.valueOf(lastColumnTakesRest));
+ signature.add(Boolean.valueOf(escaped));
+ signature.add(Byte.valueOf(escapeChar));
+ LazySimpleStructObjectInspector result = cachedLazySimpleStructObjectInspector.get(signature);
+ if (result == null) {
+ result = new LazySimpleStructObjectInspector(structFieldNames, structFieldObjectInspectors, separator,
+ nullSequence, lastColumnTakesRest, escaped, escapeChar);
+ cachedLazySimpleStructObjectInspector.put(signature, result);
+ }
+ return result;
+ }
+
+ static ConcurrentHashMap<ArrayList<Object>, LazyListObjectInspector> cachedLazySimpleListObjectInspector = new ConcurrentHashMap<ArrayList<Object>, LazyListObjectInspector>();
+
+ public static LazyListObjectInspector getLazySimpleListObjectInspector(ObjectInspector listElementObjectInspector,
+ byte separator, Text nullSequence, boolean escaped, byte escapeChar) {
+ ArrayList<Object> signature = new ArrayList<Object>();
+ signature.add(listElementObjectInspector);
+ signature.add(Byte.valueOf(separator));
+ signature.add(nullSequence.toString());
+ signature.add(Boolean.valueOf(escaped));
+ signature.add(Byte.valueOf(escapeChar));
+ LazyListObjectInspector result = cachedLazySimpleListObjectInspector.get(signature);
+ if (result == null) {
+ result = new LazyListObjectInspector(listElementObjectInspector, separator, nullSequence, escaped,
+ escapeChar);
+ cachedLazySimpleListObjectInspector.put(signature, result);
+ }
+ return result;
+ }
+
+ static ConcurrentHashMap<ArrayList<Object>, LazyMapObjectInspector> cachedLazySimpleMapObjectInspector = new ConcurrentHashMap<ArrayList<Object>, LazyMapObjectInspector>();
+
+ public static LazyMapObjectInspector getLazySimpleMapObjectInspector(ObjectInspector mapKeyObjectInspector,
+ ObjectInspector mapValueObjectInspector, byte itemSeparator, byte keyValueSeparator, Text nullSequence,
+ boolean escaped, byte escapeChar) {
+ ArrayList<Object> signature = new ArrayList<Object>();
+ signature.add(mapKeyObjectInspector);
+ signature.add(mapValueObjectInspector);
+ signature.add(Byte.valueOf(itemSeparator));
+ signature.add(Byte.valueOf(keyValueSeparator));
+ signature.add(nullSequence.toString());
+ signature.add(Boolean.valueOf(escaped));
+ signature.add(Byte.valueOf(escapeChar));
+ LazyMapObjectInspector result = cachedLazySimpleMapObjectInspector.get(signature);
+ if (result == null) {
+ result = new LazyMapObjectInspector(mapKeyObjectInspector, mapValueObjectInspector, itemSeparator,
+ keyValueSeparator, nullSequence, escaped, escapeChar);
+ cachedLazySimpleMapObjectInspector.put(signature, result);
+ }
+ return result;
+ }
+
+ static ConcurrentHashMap<List<Object>, LazyUnionObjectInspector> cachedLazyUnionObjectInspector = new ConcurrentHashMap<List<Object>, LazyUnionObjectInspector>();
+
+ public static LazyUnionObjectInspector getLazyUnionObjectInspector(List<ObjectInspector> ois, byte separator,
+ Text nullSequence, boolean escaped, byte escapeChar) {
+ List<Object> signature = new ArrayList<Object>();
+ signature.add(ois);
+ signature.add(Byte.valueOf(separator));
+ signature.add(nullSequence.toString());
+ signature.add(Boolean.valueOf(escaped));
+ signature.add(Byte.valueOf(escapeChar));
+ LazyUnionObjectInspector result = cachedLazyUnionObjectInspector.get(signature);
+ if (result == null) {
+ result = new LazyUnionObjectInspector(ois, separator, nullSequence, escaped, escapeChar);
+ cachedLazyUnionObjectInspector.put(signature, result);
+ }
+ return result;
+ }
+
+ private LazyObjectInspectorFactory() {
+ // prevent instantiation
+ }
+}
\ No newline at end of file
diff --git a/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java
new file mode 100644
index 0000000..95b999e
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.typeinfo;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.hadoop.hive.serde.Constants;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+
+/**
+ * TypeInfoFactory can be used to create the TypeInfo object for any types.
+ * TypeInfo objects are all read-only so we can reuse them easily.
+ * TypeInfoFactory has internal cache to make sure we don't create 2 TypeInfo
+ * objects that represents the same type.
+ */
+public final class TypeInfoFactory {
+
+ static ConcurrentHashMap<String, TypeInfo> cachedPrimitiveTypeInfo = new ConcurrentHashMap<String, TypeInfo>();
+
+ private TypeInfoFactory() {
+ // prevent instantiation
+ }
+
+ public static TypeInfo getPrimitiveTypeInfo(String typeName) {
+ if (null == PrimitiveObjectInspectorUtils.getTypeEntryFromTypeName(typeName)) {
+ throw new RuntimeException("Cannot getPrimitiveTypeInfo for " + typeName);
+ }
+ TypeInfo result = cachedPrimitiveTypeInfo.get(typeName);
+ if (result == null) {
+ result = new PrimitiveTypeInfo(typeName);
+ cachedPrimitiveTypeInfo.put(typeName, result);
+ }
+ return result;
+ }
+
+ public static final TypeInfo voidTypeInfo = getPrimitiveTypeInfo(Constants.VOID_TYPE_NAME);
+ public static final TypeInfo booleanTypeInfo = getPrimitiveTypeInfo(Constants.BOOLEAN_TYPE_NAME);
+ public static final TypeInfo intTypeInfo = getPrimitiveTypeInfo(Constants.INT_TYPE_NAME);
+ public static final TypeInfo longTypeInfo = getPrimitiveTypeInfo(Constants.BIGINT_TYPE_NAME);
+ public static final TypeInfo stringTypeInfo = getPrimitiveTypeInfo(Constants.STRING_TYPE_NAME);
+ public static final TypeInfo floatTypeInfo = getPrimitiveTypeInfo(Constants.FLOAT_TYPE_NAME);
+ public static final TypeInfo doubleTypeInfo = getPrimitiveTypeInfo(Constants.DOUBLE_TYPE_NAME);
+ public static final TypeInfo byteTypeInfo = getPrimitiveTypeInfo(Constants.TINYINT_TYPE_NAME);
+ public static final TypeInfo shortTypeInfo = getPrimitiveTypeInfo(Constants.SMALLINT_TYPE_NAME);
+
+ public static final TypeInfo unknownTypeInfo = getPrimitiveTypeInfo("unknown");
+
+ public static TypeInfo getPrimitiveTypeInfoFromPrimitiveWritable(Class<?> clazz) {
+ String typeName = PrimitiveObjectInspectorUtils.getTypeNameFromPrimitiveWritable(clazz);
+ if (typeName == null) {
+ throw new RuntimeException("Internal error: Cannot get typeName for " + clazz);
+ }
+ return getPrimitiveTypeInfo(typeName);
+ }
+
+ public static TypeInfo getPrimitiveTypeInfoFromJavaPrimitive(Class<?> clazz) {
+ return getPrimitiveTypeInfo(PrimitiveObjectInspectorUtils.getTypeNameFromPrimitiveJava(clazz));
+ }
+
+ static ConcurrentHashMap<ArrayList<List<?>>, TypeInfo> cachedStructTypeInfo = new ConcurrentHashMap<ArrayList<List<?>>, TypeInfo>();
+
+ public static TypeInfo getStructTypeInfo(List<String> names, List<TypeInfo> typeInfos) {
+ ArrayList<List<?>> signature = new ArrayList<List<?>>(2);
+ signature.add(names);
+ signature.add(typeInfos);
+ TypeInfo result = cachedStructTypeInfo.get(signature);
+ if (result == null) {
+ result = new StructTypeInfo(names, typeInfos);
+ cachedStructTypeInfo.put(signature, result);
+ }
+ return result;
+ }
+
+ static ConcurrentHashMap<List<?>, TypeInfo> cachedUnionTypeInfo = new ConcurrentHashMap<List<?>, TypeInfo>();
+
+ public static TypeInfo getUnionTypeInfo(List<TypeInfo> typeInfos) {
+ TypeInfo result = cachedUnionTypeInfo.get(typeInfos);
+ if (result == null) {
+ result = new UnionTypeInfo(typeInfos);
+ cachedUnionTypeInfo.put(typeInfos, result);
+ }
+ return result;
+ }
+
+ static ConcurrentHashMap<TypeInfo, TypeInfo> cachedListTypeInfo = new ConcurrentHashMap<TypeInfo, TypeInfo>();
+
+ public static TypeInfo getListTypeInfo(TypeInfo elementTypeInfo) {
+ TypeInfo result = cachedListTypeInfo.get(elementTypeInfo);
+ if (result == null) {
+ result = new ListTypeInfo(elementTypeInfo);
+ cachedListTypeInfo.put(elementTypeInfo, result);
+ }
+ return result;
+ }
+
+ static ConcurrentHashMap<ArrayList<TypeInfo>, TypeInfo> cachedMapTypeInfo = new ConcurrentHashMap<ArrayList<TypeInfo>, TypeInfo>();
+
+ public static TypeInfo getMapTypeInfo(TypeInfo keyTypeInfo, TypeInfo valueTypeInfo) {
+ ArrayList<TypeInfo> signature = new ArrayList<TypeInfo>(2);
+ signature.add(keyTypeInfo);
+ signature.add(valueTypeInfo);
+ TypeInfo result = cachedMapTypeInfo.get(signature);
+ if (result == null) {
+ result = new MapTypeInfo(keyTypeInfo, valueTypeInfo);
+ cachedMapTypeInfo.put(signature, result);
+ }
+ return result;
+ };
+
+}
\ No newline at end of file
diff --git a/hivesterix/hivesterix-dist/src/main/resources/conf/cluster.properties b/hivesterix/hivesterix-dist/src/main/resources/conf/cluster.properties
new file mode 100644
index 0000000..2d2401a
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/conf/cluster.properties
@@ -0,0 +1,37 @@
+#The CC port for Hyracks clients
+CC_CLIENTPORT=3099
+
+#The CC port for Hyracks cluster management
+CC_CLUSTERPORT=1099
+
+#The directory of hyracks binaries
+HYRACKS_HOME=../../../../hyracks
+
+#The tmp directory for cc to install jars
+CCTMP_DIR=/tmp/t1
+
+#The tmp directory for nc to install jars
+NCTMP_DIR=/tmp/t2
+
+#The directory to put cc logs
+CCLOGS_DIR=$CCTMP_DIR/logs
+
+#The directory to put nc logs
+NCLOGS_DIR=$NCTMP_DIR/logs
+
+#Comma separated I/O directories for the spilling of external sort
+IO_DIRS="/tmp/t3,/tmp/t4"
+
+#The JAVA_HOME
+JAVA_HOME=$JAVA_HOME
+
+#The frame size of the internal dataflow engine
+FRAME_SIZE=65536
+
+#CC JAVA_OPTS
+CCJAVA_OPTS="-Xdebug -Xrunjdwp:transport=dt_socket,address=7001,server=y,suspend=n -Xmx1g -Djava.util.logging.config.file=logging.properties"
+# Yourkit option: -agentpath:/grid/0/dev/vborkar/tools/yjp-10.0.4/bin/linux-x86-64/libyjpagent.so=port=20001"
+
+#NC JAVA_OPTS
+NCJAVA_OPTS="-Xdebug -Xrunjdwp:transport=dt_socket,address=7002,server=y,suspend=n -Xmx1g -Djava.util.logging.config.file=logging.properties"
+
diff --git a/hivesterix/hivesterix-dist/src/main/resources/conf/configuration.xsl b/hivesterix/hivesterix-dist/src/main/resources/conf/configuration.xsl
new file mode 100644
index 0000000..377cdbe
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/conf/configuration.xsl
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<xsl:output method="html"/>
+<xsl:template match="configuration">
+<html>
+<body>
+<table border="1">
+<tr>
+ <td>name</td>
+ <td>value</td>
+ <td>description</td>
+</tr>
+<xsl:for-each select="property">
+<tr>
+ <td><a name="{name}"><xsl:value-of select="name"/></a></td>
+ <td><xsl:value-of select="value"/></td>
+ <td><xsl:value-of select="description"/></td>
+</tr>
+</xsl:for-each>
+</table>
+</body>
+</html>
+</xsl:template>
+</xsl:stylesheet>
diff --git a/hivesterix/hivesterix-dist/src/main/resources/conf/debugnc.properties b/hivesterix/hivesterix-dist/src/main/resources/conf/debugnc.properties
new file mode 100755
index 0000000..27afa26
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/conf/debugnc.properties
@@ -0,0 +1,12 @@
+#The tmp directory for nc to install jars
+NCTMP_DIR2=/tmp/t-1
+
+#The directory to put nc logs
+NCLOGS_DIR2=$NCTMP_DIR/logs
+
+#Comma separated I/O directories for the spilling of external sort
+IO_DIRS2="/tmp/t-2,/tmp/t-3"
+
+#NC JAVA_OPTS
+NCJAVA_OPTS2="-Xdebug -Xrunjdwp:transport=dt_socket,address=7003,server=y,suspend=n -Xmx1g -Djava.util.logging.config.file=logging.properties"
+
diff --git a/hivesterix/hivesterix-dist/src/main/resources/conf/hive-default.xml b/hivesterix/hivesterix-dist/src/main/resources/conf/hive-default.xml
new file mode 100644
index 0000000..587eede
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/conf/hive-default.xml
@@ -0,0 +1,758 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+ <!-- Hive Configuration can either be stored in this file or in the hadoop
+ configuration files -->
+ <!-- that are implied by Hadoop setup variables. -->
+ <!-- Aside from Hadoop setup variables - this file is provided as a convenience
+ so that Hive -->
+ <!-- users do not have to edit hadoop configuration files (that may be managed
+ as a centralized -->
+ <!-- resource). -->
+
+ <!-- Hive Execution Parameters -->
+ <property>
+ <name>mapred.reduce.tasks</name>
+ <value>-1</value>
+ <description>The default number of reduce tasks per job. Typically set
+ to a prime close to the number of available hosts. Ignored when
+ mapred.job.tracker is "local". Hadoop set this to 1 by default,
+ whereas hive uses -1 as its default value.
+ By setting this property to -1, Hive will automatically figure out what
+ should be the number of reducers.
+ </description>
+
+ <property>
+ <name>hive.hyracks.connectorpolicy</name>
+ <value>PIPELINING</value>
+ </property>
+
+ <property>
+ <name>hive.hyracks.parrallelism</name>
+ <value>4</value>
+ </property>
+
+ <property>
+ <name>hive.algebricks.groupby.external</name>
+ <value>true</value>
+ </property>
+
+ <property>
+ <name>hive.algebricks.groupby.external.memory</name>
+ <value>33554432</value>
+ </property>
+
+ <property>
+ <name>hive.algebricks.sort.memory</name>
+ <value>33554432</value>
+ </property>
+
+ <property>
+ <name>hive.exec.reducers.bytes.per.reducer</name>
+ <value>1000000000</value>
+ <description>size per reducer.The default is 1G, i.e if the input size
+ is 10G, it will use 10 reducers.</description>
+ </property>
+
+ <property>
+ <name>hive.exec.reducers.max</name>
+ <value>999</value>
+ <description>max number of reducers will be used. If the one
+ specified in the configuration parameter mapred.reduce.tasks is
+ negative, hive will use this one as the max number of reducers when
+ automatically determine number of reducers.</description>
+ </property>
+
+ <property>
+ <name>hive.exec.scratchdir</name>
+ <value>/hive-${user.name}</value>
+ <description>Scratch space for Hive jobs</description>
+ </property>
+
+ <property>
+ <name>hive.test.mode</name>
+ <value>false</value>
+ <description>whether hive is running in test mode. If yes, it turns on
+ sampling and prefixes the output tablename</description>
+ </property>
+
+ <property>
+ <name>hive.test.mode.prefix</name>
+ <value>test_</value>
+ <description>if hive is running in test mode, prefixes the output
+ table by this string</description>
+ </property>
+
+ <!-- If the input table is not bucketed, the denominator of the tablesample
+ is determinied by the parameter below -->
+ <!-- For example, the following query: -->
+ <!-- INSERT OVERWRITE TABLE dest -->
+ <!-- SELECT col1 from src -->
+ <!-- would be converted to -->
+ <!-- INSERT OVERWRITE TABLE test_dest -->
+ <!-- SELECT col1 from src TABLESAMPLE (BUCKET 1 out of 32 on rand(1)) -->
+ <property>
+ <name>hive.test.mode.samplefreq</name>
+ <value>32</value>
+ <description>if hive is running in test mode and table is not
+ bucketed, sampling frequency</description>
+ </property>
+
+ <property>
+ <name>hive.test.mode.nosamplelist</name>
+ <value></value>
+ <description>if hive is running in test mode, dont sample the above
+ comma seperated list of tables</description>
+ </property>
+
+ <property>
+ <name>hive.metastore.local</name>
+ <value>true</value>
+ <description>controls whether to connect to remove metastore server or
+ open a new metastore server in Hive Client JVM</description>
+ </property>
+
+ <property>
+ <name>javax.jdo.option.ConnectionURL</name>
+ <value>jdbc:derby:;databaseName=metastore_db;create=true</value>
+ <description>JDBC connect string for a JDBC metastore</description>
+ </property>
+
+ <property>
+ <name>javax.jdo.option.ConnectionDriverName</name>
+ <value>org.apache.derby.jdbc.EmbeddedDriver</value>
+ <description>Driver class name for a JDBC metastore</description>
+ </property>
+
+ <property>
+ <name>javax.jdo.PersistenceManagerFactoryClass</name>
+ <value>org.datanucleus.jdo.JDOPersistenceManagerFactory</value>
+ <description>class implementing the jdo persistence</description>
+ </property>
+
+ <property>
+ <name>datanucleus.connectionPoolingType</name>
+ <value>DBCP</value>
+ <description>Uses a DBCP connection pool for JDBC metastore
+ </description>
+ </property>
+
+ <property>
+ <name>javax.jdo.option.DetachAllOnCommit</name>
+ <value>true</value>
+ <description>detaches all objects from session so that they can be
+ used after transaction is committed</description>
+ </property>
+
+ <property>
+ <name>javax.jdo.option.NonTransactionalRead</name>
+ <value>true</value>
+ <description>reads outside of transactions</description>
+ </property>
+
+ <property>
+ <name>javax.jdo.option.ConnectionUserName</name>
+ <value>APP</value>
+ <description>username to use against metastore database</description>
+ </property>
+
+ <property>
+ <name>javax.jdo.option.ConnectionPassword</name>
+ <value>mine</value>
+ <description>password to use against metastore database</description>
+ </property>
+
+ <property>
+ <name>datanucleus.validateTables</name>
+ <value>false</value>
+ <description>validates existing schema against code. turn this on if
+ you want to verify existing schema </description>
+ </property>
+
+ <property>
+ <name>datanucleus.validateColumns</name>
+ <value>false</value>
+ <description>validates existing schema against code. turn this on if
+ you want to verify existing schema </description>
+ </property>
+
+ <property>
+ <name>datanucleus.validateConstraints</name>
+ <value>false</value>
+ <description>validates existing schema against code. turn this on if
+ you want to verify existing schema </description>
+ </property>
+
+ <property>
+ <name>datanucleus.storeManagerType</name>
+ <value>rdbms</value>
+ <description>metadata store type</description>
+ </property>
+
+ <property>
+ <name>datanucleus.autoCreateSchema</name>
+ <value>true</value>
+ <description>creates necessary schema on a startup if one doesn't
+ exist. set this to false, after creating it once</description>
+ </property>
+
+ <property>
+ <name>datanucleus.autoStartMechanismMode</name>
+ <value>checked</value>
+ <description>throw exception if metadata tables are incorrect
+ </description>
+ </property>
+
+ <property>
+ <name>datanucleus.transactionIsolation</name>
+ <value>read-committed</value>
+ <description>Default transaction isolation level for identity
+ generation. </description>
+ </property>
+
+ <property>
+ <name>datanucleus.cache.level2</name>
+ <value>false</value>
+ <description>Use a level 2 cache. Turn this off if metadata is changed
+ independently of hive metastore server</description>
+ </property>
+
+ <property>
+ <name>datanucleus.cache.level2.type</name>
+ <value>SOFT</value>
+ <description>SOFT=soft reference based cache, WEAK=weak reference
+ based cache.</description>
+ </property>
+
+ <property>
+ <name>datanucleus.identifierFactory</name>
+ <value>datanucleus</value>
+ <description>Name of the identifier factory to use when generating
+ table/column names etc. 'datanucleus' is used for backward
+ compatibility</description>
+ </property>
+
+ <property>
+ <name>hive.metastore.warehouse.dir</name>
+ <value>/user/hivesterix</value>
+ <description>location of default database for the warehouse
+ </description>
+ </property>
+
+ <property>
+ <name>hive.metastore.connect.retries</name>
+ <value>5</value>
+ <description>Number of retries while opening a connection to metastore
+ </description>
+ </property>
+
+ <property>
+ <name>hive.metastore.rawstore.impl</name>
+ <value>org.apache.hadoop.hive.metastore.ObjectStore</value>
+ <description>Name of the class that implements
+ org.apache.hadoop.hive.metastore.rawstore interface. This class is
+ used to store and retrieval of raw metadata objects such as table,
+ database</description>
+ </property>
+
+ <property>
+ <name>hive.default.fileformat</name>
+ <value>TextFile</value>
+ <description>Default file format for CREATE TABLE statement. Options
+ are TextFile and SequenceFile. Users can explicitly say CREATE TABLE
+ ... STORED AS <TEXTFILE|SEQUENCEFILE> to override</description>
+ </property>
+
+ <property>
+ <name>hive.fileformat.check</name>
+ <value>true</value>
+ <description>Whether to check file format or not when loading data
+ files</description>
+ </property>
+
+ <property>
+ <name>hive.map.aggr</name>
+ <value>true</value>
+ <description>Whether to use map-side aggregation in Hive Group By
+ queries</description>
+ </property>
+
+ <property>
+ <name>hive.groupby.skewindata</name>
+ <value>false</value>
+ <description>Whether there is skew in data to optimize group by
+ queries</description>
+ </property>
+
+ <property>
+ <name>hive.groupby.mapaggr.checkinterval</name>
+ <value>100000</value>
+ <description>Number of rows after which size of the grouping
+ keys/aggregation classes is performed</description>
+ </property>
+
+ <property>
+ <name>hive.mapred.local.mem</name>
+ <value>0</value>
+ <description>For local mode, memory of the mappers/reducers
+ </description>
+ </property>
+
+ <property>
+ <name>hive.map.aggr.hash.percentmemory</name>
+ <value>0.5</value>
+ <description>Portion of total memory to be used by map-side grup
+ aggregation hash table</description>
+ </property>
+
+ <property>
+ <name>hive.map.aggr.hash.min.reduction</name>
+ <value>0.5</value>
+ <description>Hash aggregation will be turned off if the ratio between
+ hash
+ table size and input rows is bigger than this number. Set to 1 to make
+ sure
+ hash aggregation is never turned off.</description>
+ </property>
+
+ <property>
+ <name>hive.optimize.cp</name>
+ <value>true</value>
+ <description>Whether to enable column pruner</description>
+ </property>
+
+ <property>
+ <name>hive.optimize.ppd</name>
+ <value>true</value>
+ <description>Whether to enable predicate pushdown</description>
+ </property>
+
+ <property>
+ <name>hive.optimize.pruner</name>
+ <value>true</value>
+ <description>Whether to enable the new partition pruner which depends
+ on predicate pushdown. If this is disabled,
+ the old partition pruner which is based on AST will be enabled.
+ </description>
+ </property>
+
+ <property>
+ <name>hive.optimize.groupby</name>
+ <value>true</value>
+ <description>Whether to enable the bucketed group by from bucketed
+ partitions/tables.</description>
+ </property>
+
+ <property>
+ <name>hive.join.emit.interval</name>
+ <value>1000</value>
+ <description>How many rows in the right-most join operand Hive should
+ buffer before emitting the join result. </description>
+ </property>
+
+ <property>
+ <name>hive.join.cache.size</name>
+ <value>25000</value>
+ <description>How many rows in the joining tables (except the streaming
+ table) should be cached in memory. </description>
+ </property>
+
+ <property>
+ <name>hive.mapjoin.bucket.cache.size</name>
+ <value>100</value>
+ <description>How many values in each keys in the map-joined table
+ should be cached in memory. </description>
+ </property>
+
+ <property>
+ <name>hive.mapjoin.maxsize</name>
+ <value>100000</value>
+ <description>Maximum # of rows of the small table that can be handled
+ by map-side join. If the size is reached and hive.task.progress is
+ set, a fatal error counter is set and the job will be killed.
+ </description>
+ </property>
+
+ <property>
+ <name>hive.mapjoin.cache.numrows</name>
+ <value>25000</value>
+ <description>How many rows should be cached by jdbm for map join.
+ </description>
+ </property>
+
+ <property>
+ <name>hive.optimize.skewjoin</name>
+ <value>false</value>
+ <description>Whether to enable skew join optimization. </description>
+ </property>
+
+ <property>
+ <name>hive.skewjoin.key</name>
+ <value>100000</value>
+ <description>Determine if we get a skew key in join. If we see more
+ than the specified number of rows with the same key in join operator,
+ we think the key as a skew join key. </description>
+ </property>
+
+ <property>
+ <name>hive.skewjoin.mapjoin.map.tasks</name>
+ <value>10000</value>
+ <description> Determine the number of map task used in the follow up
+ map join job
+ for a skew join. It should be used together with
+ hive.skewjoin.mapjoin.min.split
+ to perform a fine grained control.</description>
+ </property>
+
+ <property>
+ <name>hive.skewjoin.mapjoin.min.split</name>
+ <value>33554432</value>
+ <description> Determine the number of map task at most used in the
+ follow up map join job
+ for a skew join by specifying the minimum split size. It should be used
+ together with
+ hive.skewjoin.mapjoin.map.tasks to perform a fine grained control.</description>
+ </property>
+
+ <property>
+ <name>hive.mapred.mode</name>
+ <value>nonstrict</value>
+ <description>The mode in which the hive operations are being
+ performed. In strict mode, some risky queries are not allowed to run
+ </description>
+ </property>
+
+ <property>
+ <name>hive.exec.script.maxerrsize</name>
+ <value>100000</value>
+ <description>Maximum number of bytes a script is allowed to emit to
+ standard error (per map-reduce task). This prevents runaway scripts
+ from filling logs partitions to capacity </description>
+ </property>
+
+ <property>
+ <name>hive.exec.script.allow.partial.consumption</name>
+ <value>false</value>
+ <description> When enabled, this option allows a user script to exit
+ successfully without consuming all the data from the standard input.
+ </description>
+ </property>
+
+ <property>
+ <name>hive.script.operator.id.env.var</name>
+ <value>HIVE_SCRIPT_OPERATOR_ID</value>
+ <description> Name of the environment variable that holds the unique
+ script operator ID in the user's transform function (the custom
+ mapper/reducer that the user has specified in the query)
+ </description>
+ </property>
+
+ <property>
+ <name>hive.exec.compress.output</name>
+ <value>false</value>
+ <description> This controls whether the final outputs of a query (to a
+ local/hdfs file or a hive table) is compressed. The compression codec
+ and other options are determined from hadoop config variables
+ mapred.output.compress* </description>
+ </property>
+
+ <property>
+ <name>hive.exec.compress.intermediate</name>
+ <value>false</value>
+ <description> This controls whether intermediate files produced by
+ hive between multiple map-reduce jobs are compressed. The compression
+ codec and other options are determined from hadoop config variables
+ mapred.output.compress* </description>
+ </property>
+
+ <property>
+ <name>hive.exec.parallel</name>
+ <value>false</value>
+ <description>Whether to execute jobs in parallel</description>
+ </property>
+
+ <property>
+ <name>hive.exec.parallel.thread.number</name>
+ <value>8</value>
+ <description>How many jobs at most can be executed in parallel
+ </description>
+ </property>
+
+ <property>
+ <name>hive.hwi.war.file</name>
+ <value>lib\hive-hwi-0.7.0.war</value>
+ <description>This sets the path to the HWI war file, relative to
+ ${HIVE_HOME}. </description>
+ </property>
+
+ <property>
+ <name>hive.hwi.listen.host</name>
+ <value>0.0.0.0</value>
+ <description>This is the host address the Hive Web Interface will
+ listen on</description>
+ </property>
+
+ <property>
+ <name>hive.hwi.listen.port</name>
+ <value>9999</value>
+ <description>This is the port the Hive Web Interface will listen on
+ </description>
+ </property>
+
+ <property>
+ <name>hive.exec.pre.hooks</name>
+ <value></value>
+ <description>Pre Execute Hook for Tests</description>
+ </property>
+
+ <property>
+ <name>hive.merge.mapfiles</name>
+ <value>true</value>
+ <description>Merge small files at the end of a map-only job
+ </description>
+ </property>
+
+ <property>
+ <name>hive.merge.mapredfiles</name>
+ <value>false</value>
+ <description>Merge small files at the end of a map-reduce job
+ </description>
+ </property>
+
+ <property>
+ <name>hive.heartbeat.interval</name>
+ <value>1000</value>
+ <description>Send a heartbeat after this interval - used by mapjoin
+ and filter operators</description>
+ </property>
+
+ <property>
+ <name>hive.merge.size.per.task</name>
+ <value>256000000</value>
+ <description>Size of merged files at the end of the job</description>
+ </property>
+
+ <property>
+ <name>hive.merge.size.smallfiles.avgsize</name>
+ <value>16000000</value>
+ <description>When the average output file size of a job is less than
+ this number, Hive will start an additional map-reduce job to merge
+ the output files into bigger files. This is only done for map-only
+ jobs if hive.merge.mapfiles is true, and for map-reduce jobs if
+ hive.merge.mapredfiles is true.</description>
+ </property>
+
+ <property>
+ <name>hive.script.auto.progress</name>
+ <value>false</value>
+ <description>Whether Hive Tranform/Map/Reduce Clause should
+ automatically send progress information to TaskTracker to avoid the
+ task getting killed because of inactivity. Hive sends progress
+ information when the script is outputting to stderr. This option
+ removes the need of periodically producing stderr messages, but users
+ should be cautious because this may prevent infinite loops in the
+ scripts to be killed by TaskTracker. </description>
+ </property>
+
+ <property>
+ <name>hive.script.serde</name>
+ <value>org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe</value>
+ <description>The default serde for trasmitting input data to and
+ reading output data from the user scripts. </description>
+ </property>
+
+ <property>
+ <name>hive.script.recordreader</name>
+ <value>org.apache.hadoop.hive.ql.exec.TextRecordReader</value>
+ <description>The default record reader for reading data from the user
+ scripts. </description>
+ </property>
+
+ <property>
+ <name>hive.script.recordwriter</name>
+ <value>org.apache.hadoop.hive.ql.exec.TextRecordWriter</value>
+ <description>The default record writer for writing data to the user
+ scripts. </description>
+ </property>
+
+ <property>
+ <name>hive.input.format</name>
+ <value>org.apache.hadoop.hive.ql.io.HiveInputFormat</value>
+ <description>The default input format, if it is not specified, the
+ system assigns it. It is set to HiveInputFormat for hadoop versions
+ 17, 18 and 19, whereas it is set to CombinedHiveInputFormat for
+ hadoop 20. The user can always overwrite it - if there is a bug in
+ CombinedHiveInputFormat, it can always be manually set to
+ HiveInputFormat. </description>
+ </property>
+
+ <property>
+ <name>hive.udtf.auto.progress</name>
+ <value>false</value>
+ <description>Whether Hive should automatically send progress
+ information to TaskTracker when using UDTF's to prevent the task
+ getting killed because of inactivity. Users should be cautious
+ because this may prevent TaskTracker from killing tasks with infinte
+ loops. </description>
+ </property>
+
+ <property>
+ <name>hive.mapred.reduce.tasks.speculative.execution</name>
+ <value>true</value>
+ <description>Whether speculative execution for reducers should be
+ turned on. </description>
+ </property>
+
+ <property>
+ <name>hive.exec.counters.pull.interval</name>
+ <value>1000</value>
+ <description>The interval with which to poll the JobTracker for the
+ counters the running job. The smaller it is the more load there will
+ be on the jobtracker, the higher it is the less granular the caught
+ will be.</description>
+ </property>
+
+ <property>
+ <name>hive.enforce.bucketing</name>
+ <value>false</value>
+ <description>Whether bucketing is enforced. If true, while inserting
+ into the table, bucketing is enforced. </description>
+ </property>
+
+ <property>
+ <name>hive.enforce.sorting</name>
+ <value>false</value>
+ <description>Whether sorting is enforced. If true, while inserting
+ into the table, sorting is enforced. </description>
+ </property>
+
+ <property>
+ <name>hive.metastore.ds.connection.url.hook</name>
+ <value></value>
+ <description>Name of the hook to use for retriving the JDO connection
+ URL. If empty, the value in javax.jdo.option.ConnectionURL is used
+ </description>
+ </property>
+
+ <property>
+ <name>hive.metastore.ds.retry.attempts</name>
+ <value>1</value>
+ <description>The number of times to retry a metastore call if there
+ were a connection error</description>
+ </property>
+
+ <property>
+ <name>hive.metastore.ds.retry.interval</name>
+ <value>1000</value>
+ <description>The number of miliseconds between metastore retry
+ attempts</description>
+ </property>
+
+ <property>
+ <name>hive.metastore.server.min.threads</name>
+ <value>200</value>
+ <description>Minimum number of worker threads in the Thrift server's
+ pool.</description>
+ </property>
+
+ <property>
+ <name>hive.metastore.server.max.threads</name>
+ <value>100000</value>
+ <description>Maximum number of worker threads in the Thrift server's
+ pool.</description>
+ </property>
+
+ <property>
+ <name>hive.metastore.server.tcp.keepalive</name>
+ <value>true</value>
+ <description>Whether to enable TCP keepalive for the metastore server.
+ Keepalive will prevent accumulation of half-open connections.
+ </description>
+ </property>
+
+ <property>
+ <name>hive.optimize.reducededuplication</name>
+ <value>true</value>
+ <description>Remove extra map-reduce jobs if the data is already
+ clustered by the same key which needs to be used again. This should
+ always be set to true. Since it is a new feature, it has been made
+ configurable.</description>
+ </property>
+
+ <property>
+ <name>hive.exec.dynamic.partition</name>
+ <value>false</value>
+ <description>Whether or not to allow dynamic partitions in DML/DDL.
+ </description>
+ </property>
+
+ <property>
+ <name>hive.exec.dynamic.partition.mode</name>
+ <value>strict</value>
+ <description>In strict mode, the user must specify at least one static
+ partition in case the user accidentally overwrites all partitions.
+ </description>
+ </property>
+
+ <property>
+ <name>hive.exec.max.dynamic.partitions</name>
+ <value>1000</value>
+ <description>Maximum number of dynamic partitions allowed to be
+ created in total.</description>
+ </property>
+
+ <property>
+ <name>hive.exec.max.dynamic.partitions.pernode</name>
+ <value>100</value>
+ <description>Maximum number of dynamic partitions allowed to be
+ created in each mapper/reducer node.</description>
+ </property>
+
+ <property>
+ <name>hive.default.partition.name</name>
+ <value>__HIVE_DEFAULT_PARTITION__</value>
+ <description>The default partition name in case the dynamic partition
+ column value is null/empty string or anyother values that cannot be
+ escaped. This value must not contain any special character used in
+ HDFS URI (e.g., ':', '%', '/' etc). The user has to be aware that the
+ dynamic partition value should not contain this value to avoid
+ confusions.</description>
+ </property>
+
+ <property>
+ <name>fs.har.impl</name>
+ <value>org.apache.hadoop.hive.shims.HiveHarFileSystem</value>
+ <description>The implementation for accessing Hadoop Archives. Note
+ that this won't be applicable to Hadoop vers less than 0.20
+ </description>
+ </property>
+
+ <property>
+ <name>hive.archive.enabled</name>
+ <value>false</value>
+ <description>Whether archiving operations are permitted</description>
+ </property>
+
+ <property>
+ <name>hive.archive.har.parentdir.settable</name>
+ <value>false</value>
+ <description>In new Hadoop versions, the parent directory must be set
+ while
+ creating a HAR. Because this functionality is hard to detect with just
+ version
+ numbers, this conf var needs to be set manually.</description>
+ </property>
+
+ <!-- HBase Storage Handler Parameters -->
+
+ <property>
+ <name>hive.hbase.wal.enabled</name>
+ <value>true</value>
+ <description>Whether writes to HBase should be forced to the
+ write-ahead log. Disabling this improves HBase write performance at
+ the risk of lost writes in case of a crash.</description>
+ </property>
+
+</configuration>
diff --git a/hivesterix/hivesterix-dist/src/main/resources/conf/hive-log4j.properties b/hivesterix/hivesterix-dist/src/main/resources/conf/hive-log4j.properties
new file mode 100644
index 0000000..784a274
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/conf/hive-log4j.properties
@@ -0,0 +1,58 @@
+#------------------------------------------------------------------------------
+#
+# The following properties set the logging levels and log appender. The
+# log4j.rootCategory variable defines the default log level and one or more
+# appenders. For the console, use 'S'. For the daily rolling file, use 'R'.
+# For an HTML formatted log, use 'H'.
+#
+# To override the default (rootCategory) log level, define a property of the
+# form (see below for available values):
+#
+# log4j.logger. =
+#
+# Available logger names:
+# TODO
+#
+# Possible Log Levels:
+# FATAL, ERROR, WARN, INFO, DEBUG
+#
+#------------------------------------------------------------------------------
+log4j.rootCategory=INFO, S
+
+log4j.logger.com.dappit.Dapper.parser=ERROR
+log4j.logger.org.w3c.tidy=FATAL
+
+#------------------------------------------------------------------------------
+#
+# The following properties configure the console (stdout) appender.
+# See http://logging.apache.org/log4j/docs/api/index.html for details.
+#
+#------------------------------------------------------------------------------
+log4j.appender.S = org.apache.log4j.ConsoleAppender
+log4j.appender.S.layout = org.apache.log4j.PatternLayout
+log4j.appender.S.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %c{1} [%p] %m%n
+
+#------------------------------------------------------------------------------
+#
+# The following properties configure the Daily Rolling File appender.
+# See http://logging.apache.org/log4j/docs/api/index.html for details.
+#
+#------------------------------------------------------------------------------
+log4j.appender.R = org.apache.log4j.DailyRollingFileAppender
+log4j.appender.R.File = logs/bensApps.log
+log4j.appender.R.Append = true
+log4j.appender.R.DatePattern = '.'yyy-MM-dd
+log4j.appender.R.layout = org.apache.log4j.PatternLayout
+log4j.appender.R.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} %c{1} [%p] %m%n
+
+#------------------------------------------------------------------------------
+#
+# The following properties configure the Rolling File appender in HTML.
+# See http://logging.apache.org/log4j/docs/api/index.html for details.
+#
+#------------------------------------------------------------------------------
+log4j.appender.H = org.apache.log4j.RollingFileAppender
+log4j.appender.H.File = logs/bensApps.html
+log4j.appender.H.MaxFileSize = 100KB
+log4j.appender.H.Append = false
+log4j.appender.H.layout = org.apache.log4j.HTMLLayout
diff --git a/hivesterix/hivesterix-dist/src/main/resources/conf/master b/hivesterix/hivesterix-dist/src/main/resources/conf/master
new file mode 100644
index 0000000..2fbb50c
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/conf/master
@@ -0,0 +1 @@
+localhost
diff --git a/hivesterix/hivesterix-dist/src/main/resources/conf/slaves b/hivesterix/hivesterix-dist/src/main/resources/conf/slaves
new file mode 100644
index 0000000..2fbb50c
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/conf/slaves
@@ -0,0 +1 @@
+localhost
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/cli.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/cli.sh
new file mode 100644
index 0000000..914aae3
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/cli.sh
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THISSERVICE=cli
+export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
+
+cli () {
+ CLASS=org.apache.hadoop.hive.cli.CliDriver
+ execHiveCmd $CLASS "$@"
+}
+
+cli_help () {
+ CLASS=org.apache.hadoop.hive.cli.CliDriver
+ execHiveCmd $CLASS "--help"
+}
+
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/help.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/help.sh
new file mode 100644
index 0000000..432859a
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/help.sh
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THISSERVICE=help
+export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
+
+help() {
+ echo "Usage ./hive <parameters> --service serviceName <service parameters>"
+ echo "Service List: $SERVICE_LIST"
+ echo "Parameters parsed:"
+ echo " --auxpath : Auxillary jars "
+ echo " --config : Hive configuration directory"
+ echo " --service : Starts specific service/component. cli is default"
+ echo "Parameters used:"
+ echo " HADOOP_HOME : Hadoop install directory"
+ echo " HIVE_OPT : Hive options"
+ echo "For help on a particular service:"
+ echo " ./hive --service serviceName --help"
+}
+
+help_help(){
+ help
+}
+
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/hiveserver.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/hiveserver.sh
new file mode 100644
index 0000000..b5edce4
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/hiveserver.sh
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THISSERVICE=hiveserver
+export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
+
+hiveserver() {
+ echo "Starting Hive Thrift Server"
+ CLASS=org.apache.hadoop.hive.service.HiveServer
+ if $cygwin; then
+ HIVE_LIB=`cygpath -w "$HIVE_LIB"`
+ fi
+ JAR=${HIVE_LIB}/hive-service-*.jar
+
+ # hadoop 20 or newer - skip the aux_jars option and hiveconf
+ exec $HADOOP jar $JAR $CLASS $HIVE_PORT "$@"
+}
+
+hiveserver_help() {
+ echo "usage HIVE_PORT=xxxx ./hive --service hiveserver"
+ echo " HIVE_PORT : Specify the server port"
+}
+
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/hwi.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/hwi.sh
new file mode 100644
index 0000000..f9cd8ec
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/hwi.sh
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THISSERVICE=hwi
+export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
+
+hwi() {
+
+ if $cygwin; then
+ HIVE_LIB=`cygpath -w "$HIVE_LIB"`
+ fi
+
+ CLASS=org.apache.hadoop.hive.hwi.HWIServer
+ # The ls hack forces the * to be expanded which is required because
+ # System.getenv doesn't do globbing
+ export HWI_JAR_FILE=$(ls ${HIVE_LIB}/hive-hwi-*.jar)
+ export HWI_WAR_FILE=$(ls ${HIVE_LIB}/hive-hwi-*.war)
+
+ #hwi requires ant jars
+ if [ "$ANT_LIB" = "" ] ; then
+ ANT_LIB=/opt/ant/lib
+ fi
+ for f in ${ANT_LIB}/*.jar; do
+ if [[ ! -f $f ]]; then
+ continue;
+ fi
+ HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:$f
+ done
+
+ export HADOOP_CLASSPATH
+
+ # hadoop 20 or newer - skip the aux_jars option and hiveconf
+ exec $HADOOP jar ${HWI_JAR_FILE} $CLASS $HIVE_OPTS "$@"
+}
+
+hwi_help(){
+ echo "Usage ANT_LIB=XXXX hive --service hwi"
+}
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/jar.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/jar.sh
new file mode 100644
index 0000000..b52f9a7
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/jar.sh
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THISSERVICE=jar
+export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
+
+jar () {
+ RUNJAR=$1
+ shift
+
+ RUNCLASS=$1
+ shift
+
+ if $cygwin; then
+ HIVE_LIB=`cygpath -w "$HIVE_LIB"`
+ fi
+
+ if [ -z "$RUNJAR" ] ; then
+ echo "RUNJAR not specified"
+ exit 3
+ fi
+
+ if [ -z "$RUNCLASS" ] ; then
+ echo "RUNCLASS not specified"
+ exit 3
+ fi
+
+ # hadoop 20 or newer - skip the aux_jars option and hiveconf
+ exec $HADOOP jar $RUNJAR $RUNCLASS $HIVE_OPTS "$@"
+}
+
+jar_help () {
+ echo "Used for applications that require Hadoop and Hive classpath and environment."
+ echo "./hive --service jar <yourjar> <yourclass> HIVE_OPTS <your_args>"
+}
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/lineage.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/lineage.sh
new file mode 100644
index 0000000..993bc8d
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/lineage.sh
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THISSERVICE=lineage
+export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
+
+lineage () {
+ CLASS=org.apache.hadoop.hive.ql.tools.LineageInfo
+
+ # cli specific code
+ if [ ! -f ${HIVE_LIB}/hive-exec-*.jar ]; then
+ echo "Missing Hive exec Jar"
+ exit 3;
+ fi
+
+ if $cygwin; then
+ HIVE_LIB=`cygpath -w "$HIVE_LIB"`
+ fi
+
+ exec $HADOOP jar ${HIVE_LIB}/hive-exec-*.jar $CLASS "$@"
+}
+
+lineage_help () {
+ echo "usage ./hive 'hql' "
+}
+
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/metastore.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/metastore.sh
new file mode 100644
index 0000000..db15f6e
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/metastore.sh
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THISSERVICE=metastore
+export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
+
+metastore() {
+ echo "Starting Hive Metastore Server"
+ CLASS=org.apache.hadoop.hive.metastore.HiveMetaStore
+ if $cygwin; then
+ HIVE_LIB=`cygpath -w "$HIVE_LIB"`
+ fi
+ JAR=${HIVE_LIB}/hive-service-*.jar
+
+ # hadoop 20 or newer - skip the aux_jars option and hiveconf
+ exec $HADOOP jar $JAR $CLASS $METASTORE_PORT "$@"
+}
+
+metastore_help() {
+ echo "usage METASTORE_PORT=xxxx ./hive --service metastore"
+ echo " METASTORE_PORT : Specify the metastore server port"
+}
+
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/rcfilecat.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/rcfilecat.sh
new file mode 100644
index 0000000..3a9264b
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/rcfilecat.sh
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+THISSERVICE=rcfilecat
+export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
+
+rcfilecat () {
+ CLASS=org.apache.hadoop.hive.cli.RCFileCat
+ HIVE_OPTS=''
+ execHiveCmd $CLASS "$@"
+}
+
+rcfilecat_help () {
+ echo "usage ./hive rcfilecat [--start='startoffset'] [--length='len'] "
+}
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/util/execHiveCmd.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/util/execHiveCmd.sh
new file mode 100644
index 0000000..167cc40
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/ext/util/execHiveCmd.sh
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+execHiveCmd () {
+ CLASS=$1;
+ shift;
+
+ # cli specific code
+ if [ ! -f ${HIVE_LIB}/hive-cli-*.jar ]; then
+ echo "Missing Hive CLI Jar"
+ exit 3;
+ fi
+
+ if $cygwin; then
+ HIVE_LIB=`cygpath -w "$HIVE_LIB"`
+ fi
+
+ # hadoop 20 or newer - skip the aux_jars option. picked up from hiveconf
+ exec $HADOOP jar ${HIVE_LIB}/hive-cli-*.jar $CLASS $HIVE_OPTS "$@"
+}
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/getip.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/getip.sh
new file mode 100755
index 0000000..8c9ae76
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/getip.sh
@@ -0,0 +1,25 @@
+#get the OS
+OS_NAME=`uname -a|awk '{print $1}'`
+LINUX_OS='Linux'
+
+if [ $OS_NAME = $LINUX_OS ];
+then
+ #Get IP Address
+ IPADDR=`/sbin/ifconfig eth0 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ if [ "$IPADDR" = "" ]
+ then
+ IPADDR=`/sbin/ifconfig em1 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ fi
+ if [ "$IPADDR" = "" ]
+ then
+ IPADDR=`/sbin/ifconfig lo | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ fi
+else
+ IPADDR=`/sbin/ifconfig en1 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ if [ "$IPADDR" = "" ]
+ then
+ IPADDR=`/sbin/ifconfig lo0 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ fi
+
+fi
+echo $IPADDR
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/hive b/hivesterix/hivesterix-dist/src/main/resources/scripts/hive
new file mode 100755
index 0000000..f98f340
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/hive
@@ -0,0 +1,213 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cygwin=false
+case "`uname`" in
+ CYGWIN*) cygwin=true;;
+esac
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/hive-config.sh
+
+SERVICE=""
+HELP=""
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --service)
+ shift
+ SERVICE=$1
+ shift
+ ;;
+ --rcfilecat)
+ SERVICE=rcfilecat
+ shift
+ ;;
+ --help)
+ HELP=_help
+ shift
+ ;;
+ *)
+ break
+ ;;
+ esac
+done
+
+if [ "$SERVICE" = "" ] ; then
+ if [ "$HELP" = "_help" ] ; then
+ SERVICE="help"
+ else
+ SERVICE="cli"
+ fi
+fi
+
+if [ -f "${HIVE_CONF_DIR}/hive-env.sh" ]; then
+ . "${HIVE_CONF_DIR}/hive-env.sh"
+fi
+
+CLASSPATH="${HIVE_CONF_DIR}"
+
+HIVE_LIB=${HIVE_HOME}/lib
+
+# needed for execution
+if [ ! -f ${HIVE_LIB}/hive-exec-*.jar ]; then
+ echo "Missing Hive Execution Jar: ${HIVE_LIB}/hive-exec-*.jar"
+ exit 1;
+fi
+
+if [ ! -f ${HIVE_LIB}/hive-metastore-*.jar ]; then
+ echo "Missing Hive MetaStore Jar"
+ exit 2;
+fi
+
+# cli specific code
+if [ ! -f ${HIVE_LIB}/hive-cli-*.jar ]; then
+ echo "Missing Hive CLI Jar"
+ exit 3;
+fi
+
+CLASSPATH=${CLASSPATH}:${HIVE_LIB}/a-hive-path.jar
+
+for f in ${HIVE_LIB}/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+# add the auxillary jars such as serdes
+if [ -d "${HIVE_AUX_JARS_PATH}" ]; then
+ for f in ${HIVE_AUX_JARS_PATH}/*.jar; do
+ if [[ ! -f $f ]]; then
+ continue;
+ fi
+ if $cygwin; then
+ f=`cygpath -w "$f"`
+ fi
+ AUX_CLASSPATH=${AUX_CLASSPATH}:$f
+ if [ "${AUX_PARAM}" == "" ]; then
+ AUX_PARAM=file://$f
+ else
+ AUX_PARAM=${AUX_PARAM},file://$f;
+ fi
+ done
+elif [ "${HIVE_AUX_JARS_PATH}" != "" ]; then
+ if $cygwin; then
+ HIVE_AUX_JARS_PATH=`echo $HIVE_AUX_JARS_PATH | sed 's/,/:/g'`
+ HIVE_AUX_JARS_PATH=`cygpath -p -w "$HIVE_AUX_JARS_PATH"`
+ HIVE_AUX_JARS_PATH=`echo $HIVE_AUX_JARS_PATH | sed 's/;/,/g'`
+ fi
+ AUX_CLASSPATH=${HIVE_AUX_JARS_PATH}
+ AUX_PARAM=file://${HIVE_AUX_JARS_PATH}
+ AUX_PARAM=`echo $AUX_PARAM | sed 's/,/,file:\/\//g'`
+fi
+
+# adding jars from auxlib directory
+for f in ${HIVE_HOME}/auxlib/*.jar; do
+ if [[ ! -f $f ]]; then
+ continue;
+ fi
+ if $cygwin; then
+ f=`cygpath -w "$f"`
+ fi
+ AUX_CLASSPATH=${AUX_CLASSPATH}:$f
+ if [ "${AUX_PARAM}" == "" ]; then
+ AUX_PARAM=file://$f
+ else
+ AUX_PARAM=${AUX_PARAM},file://$f;
+ fi
+done
+if $cygwin; then
+ CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+ CLASSPATH=${CLASSPATH};${AUX_CLASSPATH}
+else
+ CLASSPATH=${CLASSPATH}:${AUX_CLASSPATH}
+fi
+
+# pass classpath to hadoop
+export HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${CLASSPATH}"
+
+# check for hadoop in the path
+HADOOP_IN_PATH=`which hadoop 2>/dev/null`
+if [ -f ${HADOOP_IN_PATH} ]; then
+ HADOOP_DIR=`dirname "$HADOOP_IN_PATH"`/..
+fi
+# HADOOP_HOME env variable overrides hadoop in the path
+HADOOP_HOME=${HADOOP_HOME:-$HADOOP_DIR}
+if [ "$HADOOP_HOME" == "" ]; then
+ echo "Cannot find hadoop installation: \$HADOOP_HOME must be set or hadoop must be in the path";
+ exit 4;
+fi
+
+HADOOP=$HADOOP_HOME/bin/hadoop
+if [ ! -f ${HADOOP} ]; then
+ echo "Cannot find hadoop installation: \$HADOOP_HOME must be set or hadoop must be in the path";
+ exit 4;
+fi
+
+# Make sure we're using a compatible version of Hadoop
+hadoop_version=$($HADOOP version | awk '{if (NR == 1) {print $2;}}');
+
+# Save the regex to a var to workaround quoting incompatabilities
+# between Bash 3.1 and 3.2
+hadoop_version_re="^([[:digit:]]+)\.([[:digit:]]+)(\.([[:digit:]]+))?.*$"
+
+if [[ "$hadoop_version" =~ $hadoop_version_re ]]; then
+ hadoop_major_ver=${BASH_REMATCH[1]}
+ hadoop_minor_ver=${BASH_REMATCH[2]}
+ hadoop_patch_ver=${BASH_REMATCH[4]}
+else
+ echo "Unable to determine Hadoop version information."
+ echo "'hadoop version' returned:"
+ echo `$HADOOP version`
+ exit 5
+fi
+
+if [ $hadoop_minor_ver -ne 20 -o $hadoop_patch_ver -eq 0 ]; then
+ echo "Hive requires Hadoop 0.20.x (x >= 1)."
+ echo "'hadoop version' returned:"
+ echo `$HADOOP version`
+ exit 6
+fi
+
+if [ "${AUX_PARAM}" != "" ]; then
+ HIVE_OPTS="$HIVE_OPTS -hiveconf hive.aux.jars.path=${AUX_PARAM}"
+ AUX_JARS_CMD_LINE="-libjars ${AUX_PARAM}"
+fi
+
+SERVICE_LIST=""
+
+for i in "$bin"/ext/*.sh ; do
+ . $i
+done
+
+for i in "$bin"/ext/util/*.sh ; do
+ . $i
+done
+
+TORUN=""
+for j in $SERVICE_LIST ; do
+ if [ "$j" = "$SERVICE" ] ; then
+ TORUN=${j}$HELP
+ fi
+done
+
+if [ "$TORUN" = "" ] ; then
+ echo "Service $SERVICE not found"
+ echo "Available Services: $SERVICE_LIST"
+ exit 7
+else
+ $TORUN "$@"
+fi
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/hive-config.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/hive-config.sh
new file mode 100755
index 0000000..2524bbc
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/hive-config.sh
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# processes --config option from command line
+#
+
+this="$0"
+while [ -h "$this" ]; do
+ ls=`ls -ld "$this"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ this="$link"
+ else
+ this=`dirname "$this"`/"$link"
+ fi
+done
+
+# convert relative path to absolute path
+bin=`dirname "$this"`
+script=`basename "$this"`
+bin=`cd "$bin"; pwd`
+this="$bin/$script"
+
+# the root of the Hadoop installation
+export HIVE_HOME=`dirname "$bin"`
+
+#check to see if the conf dir is given as an optional argument
+while [ $# -gt 0 ]; do # Until you run out of parameters . . .
+ case "$1" in
+ --config)
+ shift
+ confdir=$1
+ shift
+ HIVE_CONF_DIR=$confdir
+ ;;
+ --auxpath)
+ shift
+ HIVE_AUX_JARS_PATH=$1
+ shift
+ ;;
+ *)
+ break;
+ ;;
+ esac
+done
+
+
+# Allow alternate conf dir location.
+HIVE_CONF_DIR="${HIVE_CONF_DIR:-$HIVE_HOME/conf}"
+
+export HIVE_CONF_DIR=$HIVE_CONF_DIR
+export HIVE_AUX_JARS_PATH=$HIVE_AUX_JARS_PATH
+
+# Default to use 256MB
+export HADOOP_HEAPSIZE=${HADOOP_HEAPSIZE:-256}
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/init-hive-dfs.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/init-hive-dfs.sh
new file mode 100755
index 0000000..ec3997a
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/init-hive-dfs.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The purpose of this script is to set warehouse's directories on HDFS
+
+DEFAULT_WAREHOUSE_DIR="/user/hive/warehouse"
+DEFAULT_TMP_DIR="/tmp"
+
+WAREHOUSE_DIR=${DEFAULT_WAREHOUSE_DIR}
+TMP_DIR=${DEFAULT_TMP_DIR}
+HELP=""
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --warehouse-dir)
+ shift
+ WAREHOUSE_DIR=$1
+ shift
+ ;;
+ --tmp-dir)
+ shift
+ TMP_DIR=$1
+ shift
+ ;;
+ --help)
+ HELP=_help
+ shift
+ ;;
+ *)
+ echo "Invalid parameter: $1"
+ HELP=_help
+ break
+ ;;
+ esac
+done
+
+if [ "$HELP" = "_help" ] ; then
+ echo "Usage $0 [--warehouse-dir <Hive user>] [--tmp-dir <Tmp dir>]"
+ echo "Default value of warehouse directory is: [$DEFAULT_WAREHOUSE_DIR]"
+ echo "Default value of the temporary directory is: [$DEFAULT_TMP_DIR]"
+ exit -1
+fi
+
+
+# check for hadoop in the path
+HADOOP_IN_PATH=`which hadoop 2>/dev/null`
+if [ -f ${HADOOP_IN_PATH} ]; then
+ HADOOP_DIR=`dirname "$HADOOP_IN_PATH"`/..
+fi
+# HADOOP_HOME env variable overrides hadoop in the path
+HADOOP_HOME=${HADOOP_HOME:-$HADOOP_DIR}
+if [ "$HADOOP_HOME" == "" ]; then
+ echo "Cannot find hadoop installation: \$HADOOP_HOME must be set or hadoop must be in the path";
+ exit 4;
+fi
+
+HADOOP_EXEC=$HADOOP_HOME/bin/hadoop
+if [ ! -f ${HADOOP} ]; then
+ echo "Cannot find hadoop installation: \$HADOOP_HOME must be set or hadoop must be in the path";
+ exit 4;
+fi
+
+
+# Ensure /tmp exist
+$HADOOP_EXEC fs -test -d ${TMP_DIR} > /dev/null 2>&1
+if [ $? -ne 0 ]
+then
+ echo "Creating directory [${TMP_DIR}]"
+ $HADOOP_EXEC fs -mkdir ${TMP_DIR}
+fi
+
+echo "Setting writeable group rights for directory [${TMP_DIR}]"
+$HADOOP_EXEC fs -chmod g+w ${TMP_DIR}
+
+
+# Ensure warehouse dir exist
+$HADOOP_EXEC fs -test -d ${WAREHOUSE_DIR} > /dev/null 2>&1
+if [ $? -ne 0 ]
+then
+ echo "Creating directory [${WAREHOUSE_DIR}]"
+ $HADOOP_EXEC fs -mkdir ${WAREHOUSE_DIR}
+fi
+
+echo "Setting writeable group rights for directory [${WAREHOUSE_DIR}]"
+$HADOOP_EXEC fs -chmod g+w ${WAREHOUSE_DIR}
+
+echo "Initialization done."
+echo
+echo "Please, do not forget to set the following configuration properties in hive-site.xml:"
+echo "hive.metastore.warehouse.dir=${WAREHOUSE_DIR}"
+echo "hive.exec.scratchdir=${TMP_DIR}"
+
+exit 0
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/startAllNCs.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/startAllNCs.sh
new file mode 100644
index 0000000..d30da26
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/startAllNCs.sh
@@ -0,0 +1,6 @@
+PREGELIX_PATH=`pwd`
+
+for i in `cat conf/slaves`
+do
+ ssh $i "cd ${PREGELIX_PATH}; export JAVA_HOME=${JAVA_HOME}; bin/startnc.sh"
+done
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/startCluster.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/startCluster.sh
new file mode 100644
index 0000000..6aa9161
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/startCluster.sh
@@ -0,0 +1,19 @@
+bin/startcc.sh
+sleep 5
+bin/startAllNCs.sh
+
+. conf/cluster.properties
+# do we need to specify the version somewhere?
+hyrackcmd=`ls ${HYRACKS_HOME}/hyracks-cli/target/hyracks-cli-*-binary-assembly/bin/hyrackscli`
+# find zip file
+appzip=`ls $PWD/../hivesterix-dist-*-binary-assembly.zip`
+
+[ -f $hyrackcmd ] || { echo "Hyracks commandline is missing"; exit -1;}
+[ -f $appzip ] || { echo "Genomix binary-assembly.zip is missing"; exit -1;}
+
+CCHOST_NAME=`cat conf/master`
+
+IPADDR=`bin/getip.sh`
+echo "connect to \"${IPADDR}:${CC_CLIENTPORT}\"; create application hivesterix \"$appzip\";" | $hyrackcmd
+echo ""
+
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/startDebugNc.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/startDebugNc.sh
new file mode 100755
index 0000000..fe6cf27
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/startDebugNc.sh
@@ -0,0 +1,50 @@
+hostname
+
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CURRENT_PATH=`pwd`
+CCHOST=`ssh ${CCHOST_NAME} "cd ${CURRENT_PATH}; bin/getip.sh"`
+
+#Import cluster properties
+. conf/cluster.properties
+. conf/debugnc.properties
+
+#Clean up temp dir
+
+rm -rf $NCTMP_DIR2
+mkdir $NCTMP_DIR2
+
+#Clean up log dir
+rm -rf $NCLOGS_DIR2
+mkdir $NCLOGS_DIR2
+
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS2 | tr "," "\n")
+for io_dir in $io_dirs
+do
+ rm -rf $io_dir
+ mkdir $io_dir
+done
+
+#Set JAVA_HOME
+export JAVA_HOME=$JAVA_HOME
+
+#Get OS
+IPADDR=`bin/getip.sh`
+
+#Get node ID
+NODEID=`hostname | cut -d '.' -f 1`
+NODEID=${NODEID}2
+
+#Set JAVA_OPTS
+export JAVA_OPTS=$NCJAVA_OPTS2
+
+cd $HYRACKS_HOME
+HYRACKS_HOME=`pwd`
+
+#Enter the temp dir
+cd $NCTMP_DIR2
+
+#Launch hyracks nc
+$HYRACKS_HOME/hyracks-server/target/appassembler/bin/hyracksnc -cc-host $CCHOST -cc-port $CC_CLUSTERPORT -cluster-net-ip-address $IPADDR -data-ip-address $IPADDR -node-id $NODEID -iodevices "${IO_DIRS2}" &> $NCLOGS_DIR2/$NODEID.log &
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/startcc.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/startcc.sh
new file mode 100644
index 0000000..efb79ce
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/startcc.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+hostname
+
+#Import cluster properties
+. conf/cluster.properties
+
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CCHOST=`bin/getip.sh`
+
+#Remove the temp dir
+rm -rf $CCTMP_DIR
+mkdir $CCTMP_DIR
+
+#Remove the logs dir
+rm -rf $CCLOGS_DIR
+mkdir $CCLOGS_DIR
+
+#Export JAVA_HOME and JAVA_OPTS
+export JAVA_HOME=$JAVA_HOME
+export JAVA_OPTS=$CCJAVA_OPTS
+
+#Launch hyracks cc script
+chmod -R 755 $HYRACKS_HOME
+$HYRACKS_HOME/hyracks-server/target/appassembler/bin/hyrackscc -client-net-ip-address $CCHOST -cluster-net-ip-address $CCHOST -client-net-port $CC_CLIENTPORT -cluster-net-port $CC_CLUSTERPORT -max-heartbeat-lapse-periods 999999 -default-max-job-attempts 0 -job-history-size 0 &> $CCLOGS_DIR/cc.log &
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/startnc.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/startnc.sh
new file mode 100644
index 0000000..6e0f90e
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/startnc.sh
@@ -0,0 +1,49 @@
+hostname
+
+MY_NAME=`hostname`
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CURRENT_PATH=`pwd`
+CCHOST=`ssh ${CCHOST_NAME} "cd ${CURRENT_PATH}; bin/getip.sh"`
+
+#Import cluster properties
+. conf/cluster.properties
+
+#Clean up temp dir
+
+rm -rf $NCTMP_DIR
+mkdir $NCTMP_DIR
+
+#Clean up log dir
+rm -rf $NCLOGS_DIR
+mkdir $NCLOGS_DIR
+
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS | tr "," "\n")
+for io_dir in $io_dirs
+do
+ rm -rf $io_dir
+ mkdir $io_dir
+done
+
+#Set JAVA_HOME
+export JAVA_HOME=$JAVA_HOME
+
+IPADDR=`bin/getip.sh`
+#echo $IPADDR
+
+#Get node ID
+NODEID=`hostname | cut -d '.' -f 1`
+
+#Set JAVA_OPTS
+export JAVA_OPTS=$NCJAVA_OPTS
+
+cd $HYRACKS_HOME
+HYRACKS_HOME=`pwd`
+
+#Enter the temp dir
+cd $NCTMP_DIR
+
+#Launch hyracks nc
+$HYRACKS_HOME/hyracks-server/target/appassembler/bin/hyracksnc -cc-host $CCHOST -cc-port $CC_CLUSTERPORT -cluster-net-ip-address $IPADDR -data-ip-address $IPADDR -node-id $NODEID -iodevices "${IO_DIRS}" &> $NCLOGS_DIR/$NODEID.log &
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/stopAllNCs.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/stopAllNCs.sh
new file mode 100644
index 0000000..12367c1
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/stopAllNCs.sh
@@ -0,0 +1,6 @@
+PREGELIX_PATH=`pwd`
+
+for i in `cat conf/slaves`
+do
+ ssh $i "cd ${PREGELIX_PATH}; bin/stopnc.sh"
+done
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/stopCluster.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/stopCluster.sh
new file mode 100644
index 0000000..4889934
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/stopCluster.sh
@@ -0,0 +1,3 @@
+bin/stopAllNCs.sh
+sleep 2
+bin/stopcc.sh
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/stopcc.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/stopcc.sh
new file mode 100644
index 0000000..c2f525a
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/stopcc.sh
@@ -0,0 +1,10 @@
+hostname
+. conf/cluster.properties
+
+#Kill process
+PID=`ps -ef|grep ${USER}|grep java|grep hyracks|awk '{print $2}'`
+echo $PID
+kill -9 $PID
+
+#Clean up CC temp dir
+rm -rf $CCTMP_DIR/*
diff --git a/hivesterix/hivesterix-dist/src/main/resources/scripts/stopnc.sh b/hivesterix/hivesterix-dist/src/main/resources/scripts/stopnc.sh
new file mode 100644
index 0000000..03ce4e7
--- /dev/null
+++ b/hivesterix/hivesterix-dist/src/main/resources/scripts/stopnc.sh
@@ -0,0 +1,23 @@
+hostname
+. conf/cluster.properties
+
+#Kill process
+PID=`ps -ef|grep ${USER}|grep java|grep 'Dapp.name=hyracksnc'|awk '{print $2}'`
+
+if [ "$PID" == "" ]; then
+ USERID=`id | sed 's/^uid=//;s/(.*$//'`
+ PID=`ps -ef|grep ${USERID}|grep java|grep 'Dapp.name=hyracksnc'|awk '{print $2}'`
+fi
+
+echo $PID
+kill -9 $PID
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS | tr "," "\n")
+for io_dir in $io_dirs
+do
+ rm -rf $io_dir/*
+done
+
+#Clean up NC temp dir
+rm -rf $NCTMP_DIR/*