Fix some merge problems: restore the HyracksCodeFormatProfile.xml Eclipse formatter profile and the genomix-data project metadata (.classpath, .project, and .settings preferences).
diff --git a/genomix/HyracksCodeFormatProfile.xml b/genomix/HyracksCodeFormatProfile.xml
new file mode 100644
index 0000000..2cde66d
--- /dev/null
+++ b/genomix/HyracksCodeFormatProfile.xml
@@ -0,0 +1,279 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<profiles version="11">
+<profile kind="CodeFormatterProfile" name="HyracksCodeFormatProfile" version="11">
+<setting id="org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.disabling_tag" value="@formatter:off"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_field" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.use_on_off_tags" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_ellipsis" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_multiple_fields" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_conditional_expression" value="80"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_binary_operator" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_array_initializer" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_after_package" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.continuation_indentation" value="2"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_binary_operator" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_package" value="0"/>
+<setting id="org.eclipse.jdt.core.compiler.source" value="1.5"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_line_comments" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.join_wrapped_lines" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_member_type" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.align_type_members_on_columns" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_unary_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.indent_parameter_description" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.lineSplit" value="120"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.indentation.size" value="4"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.enabling_tag" value="@formatter:on"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_assignment" value="0"/>
+<setting id="org.eclipse.jdt.core.compiler.problem.assertIdentifier" value="error"/>
+<setting id="org.eclipse.jdt.core.formatter.tabulation.char" value="space"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_body" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_method" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_method_declaration" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_switch" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.compiler.problem.enumIdentifier" value="error"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_ellipsis" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_block" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_method_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.compact_else_if" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_enum_constant" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.indent_root_tags" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.tabulation.size" value="4"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_empty_lines" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_block_in_case" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression" value="16"/>
+<setting id="org.eclipse.jdt.core.compiler.compliance" value="1.5"/>
+<setting id="org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer" value="2"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_unary_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_binary_expression" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode" value="enabled"/>
+<setting id="org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_label" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant" value="48"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_javadoc_comments" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.line_length" value="9999"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_between_import_groups" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.wrap_before_binary_operator" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_block" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.join_lines_in_comments" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_compact_if" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_imports" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_html" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_source_code" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.jdt.core.compiler.codegen.targetPlatform" value="1.5"/>
+<setting id="org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_member" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_header" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_block_comments" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_enum_constants" value="49"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_type_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_after_imports" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line" value="false"/>
+</profile>
+</profiles>
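
For reference, a hypothetical Java snippet (not project code) showing the style this profile encodes: end-of-line braces, 4-space indentation with spaces only, spaces around binary and assignment operators, a space before control-flow parentheses, compact else-if, and a 120-column line limit.

    // Hypothetical illustration of the profile's conventions, not actual project code.
    public class Example {
        private int count = 0;

        public int increment(int delta) {
            if (delta > 0) {
                count = count + delta;
            } else {
                count = count - 1;
            }
            return count;
        }
    }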
diff --git a/genomix/genomix-data/.classpath b/genomix/genomix-data/.classpath
new file mode 100644
index 0000000..6e26406
--- /dev/null
+++ b/genomix/genomix-data/.classpath
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" output="target/classes" path="src/main/java"/>
+ <classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources"/>
+ <classpathentry kind="src" output="target/test-classes" path="src/test/java"/>
+ <classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
+ <classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"/>
+ <classpathentry kind="output" path="target/classes"/>
+</classpath>
diff --git a/genomix/genomix-data/.project b/genomix/genomix-data/.project
new file mode 100644
index 0000000..f22376e
--- /dev/null
+++ b/genomix/genomix-data/.project
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>genomix-data</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.jdt.core.javabuilder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ <buildCommand>
+ <name>org.eclipse.m2e.core.maven2Builder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.jdt.core.javanature</nature>
+ <nature>org.eclipse.m2e.core.maven2Nature</nature>
+ </natures>
+</projectDescription>
diff --git a/genomix/genomix-data/.settings/org.eclipse.core.resources.prefs b/genomix/genomix-data/.settings/org.eclipse.core.resources.prefs
new file mode 100644
index 0000000..609d3ca
--- /dev/null
+++ b/genomix/genomix-data/.settings/org.eclipse.core.resources.prefs
@@ -0,0 +1,4 @@
+eclipse.preferences.version=1
+encoding//src/main/resources=UTF-8
+encoding//src/test/resources=UTF-8
+encoding/<project>=UTF-8
diff --git a/genomix/genomix-data/.settings/org.eclipse.jdt.core.prefs b/genomix/genomix-data/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..ec4300d
--- /dev/null
+++ b/genomix/genomix-data/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,5 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
+org.eclipse.jdt.core.compiler.compliance=1.7
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.7
diff --git a/genomix/genomix-data/.settings/org.eclipse.m2e.core.prefs b/genomix/genomix-data/.settings/org.eclipse.m2e.core.prefs
new file mode 100644
index 0000000..f897a7f
--- /dev/null
+++ b/genomix/genomix-data/.settings/org.eclipse.m2e.core.prefs
@@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
diff --git a/genomix/genomix-data/HyracksCodeFormatProfile.xml b/genomix/genomix-data/HyracksCodeFormatProfile.xml
new file mode 100644
index 0000000..2cde66d
--- /dev/null
+++ b/genomix/genomix-data/HyracksCodeFormatProfile.xml
@@ -0,0 +1,279 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<profiles version="11">
+<profile kind="CodeFormatterProfile" name="HyracksCodeFormatProfile" version="11">
+<setting id="org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.disabling_tag" value="@formatter:off"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_field" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.use_on_off_tags" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_ellipsis" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_multiple_fields" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_conditional_expression" value="80"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_binary_operator" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_array_initializer" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_after_package" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.continuation_indentation" value="2"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_binary_operator" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_package" value="0"/>
+<setting id="org.eclipse.jdt.core.compiler.source" value="1.5"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_line_comments" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.join_wrapped_lines" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_member_type" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.align_type_members_on_columns" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_unary_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.indent_parameter_description" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.lineSplit" value="120"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.indentation.size" value="4"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.enabling_tag" value="@formatter:on"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_assignment" value="0"/>
+<setting id="org.eclipse.jdt.core.compiler.problem.assertIdentifier" value="error"/>
+<setting id="org.eclipse.jdt.core.formatter.tabulation.char" value="space"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_body" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_method" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_method_declaration" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_switch" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.compiler.problem.enumIdentifier" value="error"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_ellipsis" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_block" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_method_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.compact_else_if" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_enum_constant" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.indent_root_tags" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.tabulation.size" value="4"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_empty_lines" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_block_in_case" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression" value="16"/>
+<setting id="org.eclipse.jdt.core.compiler.compliance" value="1.5"/>
+<setting id="org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer" value="2"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_unary_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_binary_expression" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode" value="enabled"/>
+<setting id="org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_label" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant" value="48"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_javadoc_comments" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.line_length" value="9999"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_between_import_groups" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.wrap_before_binary_operator" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_block" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.join_lines_in_comments" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_compact_if" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_imports" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_html" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_source_code" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration" value="16"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.jdt.core.compiler.codegen.targetPlatform" value="1.5"/>
+<setting id="org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation" value="0"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_member" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_header" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.format_block_comments" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.alignment_for_enum_constants" value="49"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.brace_position_for_type_declaration" value="end_of_line"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.blank_lines_after_imports" value="1"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header" value="true"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for" value="insert"/>
+<setting id="org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments" value="do not insert"/>
+<setting id="org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column" value="false"/>
+<setting id="org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line" value="false"/>
+</profile>
+</profiles>
diff --git a/genomix/genomix-data/pom.xml b/genomix/genomix-data/pom.xml
new file mode 100644
index 0000000..2dd44bb
--- /dev/null
+++ b/genomix/genomix-data/pom.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <artifactId>genomix-data</artifactId>
+ <name>genomix-data</name>
+
+ <parent>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>genomix</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ </parent>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.0.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ <fork>true</fork>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-core</artifactId>
+ <version>0.20.2</version>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/genomix/genomix-data/src/main/assembly/binary-assembly.xml b/genomix/genomix-data/src/main/assembly/binary-assembly.xml
new file mode 100644
index 0000000..68d424a
--- /dev/null
+++ b/genomix/genomix-data/src/main/assembly/binary-assembly.xml
@@ -0,0 +1,19 @@
+<assembly>
+ <id>binary-assembly</id>
+ <formats>
+ <format>zip</format>
+ <format>dir</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <fileSets>
+ <fileSet>
+ <directory>target/appassembler/bin</directory>
+ <outputDirectory>bin</outputDirectory>
+ <fileMode>0755</fileMode>
+ </fileSet>
+ <fileSet>
+ <directory>target/appassembler/lib</directory>
+ <outputDirectory>lib</outputDirectory>
+ </fileSet>
+ </fileSets>
+</assembly>
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java
new file mode 100644
index 0000000..5be5f83
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/GeneCode.java
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.type;
+
+public class GeneCode {
+ public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
+ /**
+ * make sure this 4 ids equal to the sequence id of char in {@GENE_SYMBOL
+ * }
+ */
+ public static final byte A = 0;
+ public static final byte C = 1;
+ public static final byte G = 2;
+ public static final byte T = 3;
+
+ public static byte getCodeFromSymbol(byte ch) {
+ byte r = 0;
+ switch (ch) {
+ case 'A':
+ case 'a':
+ r = A;
+ break;
+ case 'C':
+ case 'c':
+ r = C;
+ break;
+ case 'G':
+ case 'g':
+ r = G;
+ break;
+ case 'T':
+ case 't':
+ r = T;
+ break;
+ }
+ return r;
+ }
+
+ public static byte getSymbolFromCode(byte code) {
+ if (code > 3) {
+ return '!';
+ }
+ return GENE_SYMBOL[code];
+ }
+
+ public static byte getAdjBit(byte t) {
+ byte r = 0;
+ switch (t) {
+ case 'A':
+ case 'a':
+ r = 1 << A;
+ break;
+ case 'C':
+ case 'c':
+ r = 1 << C;
+ break;
+ case 'G':
+ case 'g':
+ r = 1 << G;
+ break;
+ case 'T':
+ case 't':
+ r = 1 << T;
+ break;
+ }
+ return r;
+ }
+
+ /**
+ * It works for path merge. Merge the kmer by his next, we need to make sure
+ * the @{t} is a single neighbor.
+ *
+ * @param t
+ * the neighbor code in BitMap
+ * @return the genecode
+ */
+ public static byte getGeneCodeFromBitMap(byte t) {
+ switch (t) {
+ case 1 << A:
+ return A;
+ case 1 << C:
+ return C;
+ case 1 << G:
+ return G;
+ case 1 << T:
+ return T;
+ }
+ return -1;
+ }
+
+ public static byte getBitMapFromGeneCode(byte t) {
+ return (byte) (1 << t);
+ }
+
+ public static int countNumberOfBitSet(int i) {
+ int c = 0;
+ for (; i != 0; c++) {
+ i &= i - 1;
+ }
+ return c;
+ }
+
+ public static int inDegree(byte bitmap) {
+ return countNumberOfBitSet((bitmap >> 4) & 0x0f);
+ }
+
+ public static int outDegree(byte bitmap) {
+ return countNumberOfBitSet(bitmap & 0x0f);
+ }
+
+ public static byte mergePreNextAdj(byte pre, byte next) {
+ return (byte) (pre << 4 | (next & 0x0f));
+ }
+
+ public static String getSymbolFromBitMap(byte code) {
+ int left = (code >> 4) & 0x0F;
+ int right = code & 0x0F;
+ StringBuilder str = new StringBuilder();
+ for (int i = A; i <= T; i++) {
+ if ((left & (1 << i)) != 0) {
+ str.append((char) GENE_SYMBOL[i]);
+ }
+ }
+ str.append('|');
+ for (int i = A; i <= T; i++) {
+ if ((right & (1 << i)) != 0) {
+ str.append((char) GENE_SYMBOL[i]);
+ }
+ }
+ return str.toString();
+ }
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
new file mode 100644
index 0000000..fd4c252
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerBytesWritable.java
@@ -0,0 +1,313 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.type;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+
+/**
+ * Fix kmer length byteswritable
+ * It was used to generate the graph in which phase the kmer length doesn't change.
+ * Thus the size of bytes doesn't change either.
+ */
+public class KmerBytesWritable extends BinaryComparable implements Serializable, WritableComparable<BinaryComparable> {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ private static final byte[] EMPTY_BYTES = {};
+
+ protected int size;
+ protected byte[] bytes;
+ protected int kmerlength;
+
+ @Deprecated
+ public KmerBytesWritable() {
+ this(0, EMPTY_BYTES);
+ }
+
+ public KmerBytesWritable(int k, byte[] storage) {
+ this.kmerlength = k;
+ if (k > 0) {
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ this.bytes = storage;
+ if (this.bytes.length < size) {
+ throw new ArrayIndexOutOfBoundsException("Storage is smaller than required space for kmerlength:k");
+ }
+ } else {
+ this.bytes = storage;
+ this.size = 0;
+ }
+ }
+
+ /**
+ * Initial Kmer space by kmerlength
+ *
+ * @param k
+ * kmerlength
+ */
+ public KmerBytesWritable(int k) {
+ this.kmerlength = k;
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ if (k > 0) {
+ this.bytes = new byte[this.size];
+ } else {
+ this.bytes = EMPTY_BYTES;
+ }
+ }
+
+ public KmerBytesWritable(KmerBytesWritable right) {
+ if (right != null) {
+ this.kmerlength = right.kmerlength;
+ this.size = right.size;
+ this.bytes = new byte[right.size];
+ set(right);
+ }else{
+ this.kmerlength = 0;
+ this.size = 0;
+ this.bytes = EMPTY_BYTES;
+ }
+ }
+
+ public byte getGeneCodeAtPosition(int pos) {
+ if (pos >= kmerlength) {
+ return -1;
+ }
+ int posByte = pos / 4;
+ int shift = (pos % 4) << 1;
+ return (byte) ((bytes[size - 1 - posByte] >> shift) & 0x3);
+ }
+
+ public int getKmerLength() {
+ return this.kmerlength;
+ }
+
+ @Override
+ public byte[] getBytes() {
+ return bytes;
+ }
+
+ @Override
+ public int getLength() {
+ return size;
+ }
+
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public void setByRead(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = this.size - 1;
+ for (int i = start; i < start + kmerlength && i < array.length; i++) {
+ byte code = GeneCode.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[0] = l;
+ }
+ }
+
+ /**
+ * Compress Reversed Kmer into bytes array AATAG will compress as
+ * [0x000A,0xATAG]
+ *
+ * @param input
+ * array
+ * @param start
+ * position
+ */
+ public void setByReadReverse(byte[] array, int start) {
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = size - 1;
+ for (int i = start + kmerlength - 1; i >= 0 && i < array.length; i--) {
+ byte code = GeneCode.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[0] = l;
+ }
+ }
+
+ /**
+ * Shift Kmer to accept new char input
+ *
+ * @param c
+ * Input new gene character
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextChar(byte c) {
+ return shiftKmerWithNextCode(GeneCode.getCodeFromSymbol(c));
+ }
+
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shift out gene, in gene code format
+ */
+ public byte shiftKmerWithNextCode(byte c) {
+ byte output = (byte) (bytes[size - 1] & 0x03);
+ for (int i = size - 1; i > 0; i--) {
+ byte in = (byte) (bytes[i - 1] & 0x03);
+ bytes[i] = (byte) (((bytes[i] >>> 2) & 0x3f) | (in << 6));
+ }
+ int pos = ((kmerlength - 1) % 4) << 1;
+ byte code = (byte) (c << pos);
+ bytes[0] = (byte) (((bytes[0] >>> 2) & 0x3f) | code);
+ clearLeadBit();
+ return output;
+ }
+
+ /**
+ * Shift Kmer to accept new input char
+ *
+ * @param c
+ * Input new gene character
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreChar(byte c) {
+ return shiftKmerWithPreCode(GeneCode.getCodeFromSymbol(c));
+ }
+
+ /**
+ * Shift Kmer to accept new gene code
+ *
+ * @param c
+ * Input new gene code
+ * @return the shiftout gene, in gene code format
+ */
+ public byte shiftKmerWithPreCode(byte c) {
+ int pos = ((kmerlength - 1) % 4) << 1;
+ byte output = (byte) ((bytes[0] >> pos) & 0x03);
+ for (int i = 0; i < size - 1; i++) {
+ byte in = (byte) ((bytes[i + 1] >> 6) & 0x03);
+ bytes[i] = (byte) ((bytes[i] << 2) | in);
+ }
+ bytes[size - 1] = (byte) ((bytes[size - 1] << 2) | c);
+ clearLeadBit();
+ return output;
+ }
+
+ protected void clearLeadBit() {
+ if (kmerlength % 4 != 0) {
+ bytes[0] &= (1 << ((kmerlength % 4) << 1)) - 1;
+ }
+ }
+
+ public void set(KmerBytesWritable newData) {
+ if (kmerlength != newData.kmerlength){
+ throw new IllegalArgumentException("kmerSize is different, try to use VKmerBytesWritable instead");
+ }
+ if (kmerlength > 0 ){
+ set(newData.bytes, 0, newData.size);
+ }
+ }
+
+ public void set(byte[] newData, int offset, int length) {
+ if (kmerlength > 0){
+ System.arraycopy(newData, offset, bytes, 0, size);
+ }
+ }
+
+ /**
+ * Don't read the kmerlength from datastream,
+ * Read it from configuration
+ */
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ this.kmerlength = in.readInt();
+ this.size = KmerUtil.getByteNumFromK(kmerlength);
+ if (this.kmerlength > 0) {
+ if (this.bytes.length < this.size) {
+ this.bytes = new byte[this.size];
+ }
+ in.readFully(bytes, 0, size);
+ }
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(kmerlength);
+ if (kmerlength > 0) {
+ out.write(bytes, 0, size);
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return super.hashCode() * 31 + this.kmerlength;
+ }
+
+ @Override
+ public boolean equals(Object right_obj) {
+ if (right_obj instanceof KmerBytesWritable)
+ return this.kmerlength == ((KmerBytesWritable) right_obj).kmerlength && super.equals(right_obj);
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ return KmerUtil.recoverKmerFrom(this.kmerlength, this.getBytes(), 0, this.getLength());
+ }
+
+ public static class Comparator extends WritableComparator {
+ public final int LEAD_BYTES = 4;
+
+ public Comparator() {
+ super(KmerBytesWritable.class);
+ }
+
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ int kmerlength1 = readInt(b1, s1);
+ int kmerlength2 = readInt(b2, s2);
+ if (kmerlength1 == kmerlength2) {
+ return compareBytes(b1, s1 + LEAD_BYTES, l1 - LEAD_BYTES, b2, s2 + LEAD_BYTES, l2 - LEAD_BYTES);
+ }
+ return kmerlength1 - kmerlength2;
+ }
+ }
+
+ static { // register this comparator
+ WritableComparator.define(KmerBytesWritable.class, new Comparator());
+ }
+
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
new file mode 100644
index 0000000..fab7001
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerCountValue.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.type;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+public class KmerCountValue implements Writable {
+ private byte adjBitMap;
+ private byte count;
+
+ public KmerCountValue(byte bitmap, byte count) {
+ set(bitmap, count);
+ }
+
+ public KmerCountValue() {
+ adjBitMap = 0;
+ count = 0;
+ }
+
+ @Override
+ public void readFields(DataInput arg0) throws IOException {
+ adjBitMap = arg0.readByte();
+ count = arg0.readByte();
+ }
+
+ @Override
+ public void write(DataOutput arg0) throws IOException {
+ arg0.writeByte(adjBitMap);
+ arg0.writeByte(count);
+ }
+
+ @Override
+ public String toString() {
+ return GeneCode.getSymbolFromBitMap(adjBitMap) + '\t' + String.valueOf(count);
+ }
+
+ public void set(byte bitmap, byte count) {
+ this.adjBitMap = bitmap;
+ this.count = count;
+ }
+
+ public byte getAdjBitMap() {
+ return adjBitMap;
+ }
+
+ public void setAdjBitMap(byte adjBitMap) {
+ this.adjBitMap = adjBitMap;
+ }
+
+ public byte getCount() {
+ return count;
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
new file mode 100644
index 0000000..68dec30
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/KmerUtil.java
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.type;
+
+public class KmerUtil {
+ public static final String empty = "";
+
+ public static int getByteNumFromK(int k) {
+ int x = k / 4;
+ if (k % 4 != 0) {
+ x += 1;
+ }
+ return x;
+ }
+
+ public static byte reverseKmerByte(byte k) {
+ int x = (((k >> 2) & 0x33) | ((k << 2) & 0xcc));
+ return (byte) (((x >> 4) & 0x0f) | ((x << 4) & 0xf0));
+ }
+
+ public static String recoverKmerFrom(int k, byte[] keyData, int keyStart, int keyLength) {
+ StringBuilder strKmer = new StringBuilder();
+ int byteId = keyStart + keyLength - 1;
+ if (byteId < 0) {
+ return empty;
+ }
+ byte currentbyte = keyData[byteId];
+ for (int geneCount = 0; geneCount < k; geneCount++) {
+ if (geneCount % 4 == 0 && geneCount > 0) {
+ currentbyte = keyData[--byteId];
+ }
+ strKmer.append((char) GeneCode.GENE_SYMBOL[(currentbyte >> ((geneCount % 4) * 2)) & 0x03]);
+ }
+ return strKmer.toString();
+ }
+
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
new file mode 100644
index 0000000..abedad6
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritable.java
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.type;
+
+public class VKmerBytesWritable extends KmerBytesWritable {
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ @Deprecated
+ public VKmerBytesWritable() {
+ super();
+ }
+
+ public VKmerBytesWritable(int k, byte[] storage) {
+ super(k, storage);
+ }
+
+ public VKmerBytesWritable(int k) {
+ super(k);
+ }
+
+ public VKmerBytesWritable(KmerBytesWritable other) {
+ super(other);
+ }
+
+ protected void setSize(int size) {
+ if (size > getCapacity()) {
+ setCapacity((size * 3 / 2));
+ }
+ this.size = size;
+ }
+
+ protected int getCapacity() {
+ return bytes.length;
+ }
+
+ protected void setCapacity(int new_cap) {
+ if (new_cap != getCapacity()) {
+ byte[] new_data = new byte[new_cap];
+ if (new_cap < size) {
+ size = new_cap;
+ }
+ if (size != 0) {
+ System.arraycopy(bytes, 0, new_data, 0, size);
+ }
+ bytes = new_data;
+ }
+ }
+
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public void setByRead(int k, byte[] array, int start) {
+ reset(k);
+ super.setByRead(array, start);
+ }
+
+ /**
+ * Compress Reversed Kmer into bytes array AATAG will compress as
+ * [0x000A,0xATAG]
+ *
+ * @param input
+ * array
+ * @param start
+ * position
+ */
+ public void setByReadReverse(int k, byte[] array, int start) {
+ reset(k);
+ super.setByReadReverse(array, start);
+ }
+
+ @Override
+ public void set(KmerBytesWritable newData) {
+ if (newData == null){
+ this.set(0,null,0,0);
+ }else{
+ this.set(newData.kmerlength, newData.bytes, 0, newData.size);
+ }
+ }
+
+ public void set(int k, byte[] newData, int offset, int length) {
+ reset(k);
+ if (k > 0 ){
+ System.arraycopy(newData, offset, bytes, 0, size);
+ }
+ }
+
+ /**
+ * Reset array by kmerlength
+ *
+ * @param k
+ */
+ public void reset(int k) {
+ this.kmerlength = k;
+ setSize(0);
+ setSize(KmerUtil.getByteNumFromK(k));
+ clearLeadBit();
+ }
+
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
new file mode 100644
index 0000000..c00967f
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/VKmerBytesWritableFactory.java
@@ -0,0 +1,311 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.type;
+
+public class VKmerBytesWritableFactory {
+ private VKmerBytesWritable kmer;
+
+ public VKmerBytesWritableFactory(int k) {
+ kmer = new VKmerBytesWritable(k);
+ }
+
+ /**
+ * Read Kmer from read text into bytes array e.g. AATAG will compress as
+ * [0x000G, 0xATAA]
+ *
+ * @param k
+ * @param array
+ * @param start
+ */
+ public VKmerBytesWritable getKmerByRead(int k, byte[] array, int start) {
+ kmer.setByRead(k, array, start);
+ return kmer;
+ }
+
+ /**
+ * Compress Reversed Kmer into bytes array AATAG will compress as
+ * [0x000A,0xATAG]
+ *
+ * @param array
+ * @param start
+ */
+ public VKmerBytesWritable getKmerByReadReverse(int k, byte[] array, int start) {
+ kmer.setByReadReverse(k, array, start);
+ return kmer;
+ }
+
+ /**
+ * Get last kmer from kmer-chain.
+ * e.g. kmerChain is AAGCTA, if k =5, it will
+ * return AGCTA
+ *
+ * @param k
+ * @param kInChain
+ * @param kmerChain
+ * @return LastKmer bytes array
+ */
+ public VKmerBytesWritable getLastKmerFromChain(int lastK, final KmerBytesWritable kmerChain) {
+ if (lastK > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (lastK == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(lastK);
+
+ /** from end to start */
+ int byteInChain = kmerChain.getLength() - 1 - (kmerChain.getKmerLength() - lastK) / 4;
+ int posInByteOfChain = ((kmerChain.getKmerLength() - lastK) % 4) << 1; // *2
+ int byteInKmer = kmer.getLength() - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+ kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
+ kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ }
+
+ /** last kmer byte */
+ if (byteInKmer == 0) {
+ kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ }
+ kmer.clearLeadBit();
+ return kmer;
+ }
+
+ /**
+ * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if k=5, it will
+ * return AAGCT
+ *
+ * @param k
+ * @param kInChain
+ * @param kmerChain
+ * @return FirstKmer bytes array
+ */
+ public VKmerBytesWritable getFirstKmerFromChain(int firstK, final KmerBytesWritable kmerChain) {
+ if (firstK > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (firstK == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(firstK);
+
+ int i = 1;
+ for (; i < kmer.getLength(); i++) {
+ kmer.getBytes()[kmer.getLength() - i] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ }
+ int posInByteOfChain = (firstK % 4) << 1; // *2
+ if (posInByteOfChain == 0) {
+ kmer.getBytes()[0] = kmerChain.getBytes()[kmerChain.getLength() - i];
+ } else {
+ kmer.getBytes()[0] = (byte) (kmerChain.getBytes()[kmerChain.getLength() - i] & ((1 << posInByteOfChain) - 1));
+ }
+ kmer.clearLeadBit();
+ return kmer;
+ }
+
+ public VKmerBytesWritable getSubKmerFromChain(int startK, int kSize, final KmerBytesWritable kmerChain) {
+ if (startK + kSize > kmerChain.getKmerLength()) {
+ return null;
+ }
+ if (startK == 0 && kSize == kmerChain.getKmerLength()) {
+ kmer.set(kmerChain);
+ return kmer;
+ }
+ kmer.reset(kSize);
+
+ /** from end to start */
+ int byteInChain = kmerChain.getLength() - 1 - startK / 4;
+ int posInByteOfChain = startK % 4 << 1; // *2
+ int byteInKmer = kmer.getLength() - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+ kmer.getBytes()[byteInKmer] = (byte) ((0xff & kmerChain.getBytes()[byteInChain]) >> posInByteOfChain);
+ kmer.getBytes()[byteInKmer] |= ((kmerChain.getBytes()[byteInChain - 1] << (8 - posInByteOfChain)));
+ }
+
+ /** last kmer byte */
+ if (byteInKmer == 0) {
+ kmer.getBytes()[0] = (byte) ((kmerChain.getBytes()[0] & 0xff) >> posInByteOfChain);
+ }
+ kmer.clearLeadBit();
+ return kmer;
+ }
+
+ /**
+ * Merge kmer with next neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAGCTA
+ *
+ * @param k
+ * :input k of kmer
+ * @param kmer
+ * : input bytes of kmer
+ * @param nextCode
+ * : next neighbor in gene-code format
+ * @return the merged Kmer, this K of this Kmer is k+1
+ */
+ public VKmerBytesWritable mergeKmerWithNextCode(final KmerBytesWritable kmer, byte nextCode) {
+ this.kmer.reset(kmer.getKmerLength() + 1);
+ for (int i = 1; i <= kmer.getLength(); i++) {
+ this.kmer.getBytes()[this.kmer.getLength() - i] = kmer.getBytes()[kmer.getLength() - i];
+ }
+ if (this.kmer.getLength() > kmer.getLength()) {
+ this.kmer.getBytes()[0] = (byte) (nextCode & 0x3);
+ } else {
+ this.kmer.getBytes()[0] = (byte) (kmer.getBytes()[0] | ((nextCode & 0x3) << ((kmer.getKmerLength() % 4) << 1)));
+ }
+ this.kmer.clearLeadBit();
+ return this.kmer;
+ }
+
+ /**
+ * Merge kmer with previous neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAAGCT
+ *
+ * @param k
+ * :input k of kmer
+ * @param kmer
+ * : input bytes of kmer
+ * @param preCode
+ * : next neighbor in gene-code format
+ * @return the merged Kmer,this K of this Kmer is k+1
+ */
+ public VKmerBytesWritable mergeKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+ this.kmer.reset(kmer.getKmerLength() + 1);
+ int byteInMergedKmer = 0;
+ if (kmer.getKmerLength() % 4 == 0) {
+ this.kmer.getBytes()[0] = (byte) ((kmer.getBytes()[0] >> 6) & 0x3);
+ byteInMergedKmer++;
+ }
+ for (int i = 0; i < kmer.getLength() - 1; i++, byteInMergedKmer++) {
+ this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[i] << 2) | ((kmer.getBytes()[i + 1] >> 6) & 0x3));
+ }
+ this.kmer.getBytes()[byteInMergedKmer] = (byte) ((kmer.getBytes()[kmer.getLength() - 1] << 2) | (preCode & 0x3));
+ this.kmer.clearLeadBit();
+ return this.kmer;
+ }
+
+ /**
+ * Merge two kmer to one kmer
+ * e.g. ACTA + ACCGT => ACTAACCGT
+ *
+ * @param preK
+ * : previous k of kmer
+ * @param kmerPre
+ * : bytes array of previous kmer
+ * @param nextK
+ * : next k of kmer
+ * @param kmerNext
+ * : bytes array of next kmer
+ * @return merged kmer, the new k is @preK + @nextK
+ */
+ public VKmerBytesWritable mergeTwoKmer(final KmerBytesWritable preKmer, final KmerBytesWritable nextKmer) {
+ kmer.reset(preKmer.getKmerLength() + nextKmer.getKmerLength());
+ int i = 1;
+ for (; i <= preKmer.getLength(); i++) {
+ kmer.getBytes()[kmer.getLength() - i] = preKmer.getBytes()[preKmer.getLength() - i];
+ }
+ if (i > 1) {
+ i--;
+ }
+ if (preKmer.getKmerLength() % 4 == 0) {
+ for (int j = 1; j <= nextKmer.getLength(); j++) {
+ kmer.getBytes()[kmer.getLength() - i - j] = nextKmer.getBytes()[nextKmer.getLength() - j];
+ }
+ } else {
+ int posNeedToMove = ((preKmer.getKmerLength() % 4) << 1);
+ kmer.getBytes()[kmer.getLength() - i] |= nextKmer.getBytes()[nextKmer.getLength() - 1] << posNeedToMove;
+ for (int j = 1; j < nextKmer.getLength(); j++) {
+ kmer.getBytes()[kmer.getLength() - i - j] = (byte) (((nextKmer.getBytes()[nextKmer.getLength() - j] & 0xff) >> (8 - posNeedToMove)) | (nextKmer
+ .getBytes()[nextKmer.getLength() - j - 1] << posNeedToMove));
+ }
+ if (nextKmer.getKmerLength() % 4 == 0 || (nextKmer.getKmerLength() % 4) * 2 + posNeedToMove > 8) {
+ kmer.getBytes()[0] = (byte) ((0xff & nextKmer.getBytes()[0]) >> (8 - posNeedToMove));
+ }
+ }
+ kmer.clearLeadBit();
+ return kmer;
+ }
+
+ /**
+ * Safely shifted the kmer forward without change the input kmer
+ * e.g. AGCGC shift with T => GCGCT
+ *
+ * @param k
+ * : kmer length
+ * @param kmer
+ * : input kmer
+ * @param afterCode
+ * : input genecode
+ * @return new created kmer that shifted by afterCode, the K will not change
+ */
+ public VKmerBytesWritable shiftKmerWithNextCode(final KmerBytesWritable kmer, byte afterCode) {
+ this.kmer.set(kmer);
+ this.kmer.shiftKmerWithNextCode(afterCode);
+ return this.kmer;
+ }
+
+ /**
+ * Safely shifted the kmer backward without change the input kmer
+ * e.g. AGCGC shift with T => TAGCG
+ *
+ * @param k
+ * : kmer length
+ * @param kmer
+ * : input kmer
+ * @param preCode
+ * : input genecode
+ * @return new created kmer that shifted by preCode, the K will not change
+ */
+ public VKmerBytesWritable shiftKmerWithPreCode(final KmerBytesWritable kmer, byte preCode) {
+ this.kmer.set(kmer);
+ this.kmer.shiftKmerWithPreCode(preCode);
+ return this.kmer;
+ }
+
+ /**
+ * get the reverse sequence of given kmer
+ *
+ * @param kmer
+ */
+ public VKmerBytesWritable reverse(final KmerBytesWritable kmer) {
+ this.kmer.reset(kmer.getKmerLength());
+
+ int curPosAtKmer = ((kmer.getKmerLength() - 1) % 4) << 1;
+ int curByteAtKmer = 0;
+
+ int curPosAtReverse = 0;
+ int curByteAtReverse = this.kmer.getLength() - 1;
+ this.kmer.getBytes()[curByteAtReverse] = 0;
+ for (int i = 0; i < kmer.getKmerLength(); i++) {
+ byte gene = (byte) ((kmer.getBytes()[curByteAtKmer] >> curPosAtKmer) & 0x03);
+ this.kmer.getBytes()[curByteAtReverse] |= gene << curPosAtReverse;
+ curPosAtReverse += 2;
+ if (curPosAtReverse >= 8) {
+ curPosAtReverse = 0;
+ this.kmer.getBytes()[--curByteAtReverse] = 0;
+ }
+ curPosAtKmer -= 2;
+ if (curPosAtKmer < 0) {
+ curPosAtKmer = 6;
+ curByteAtKmer++;
+ }
+ }
+ this.kmer.clearLeadBit();
+ return this.kmer;
+ }
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/Kmer.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/Kmer.java
new file mode 100644
index 0000000..7aa05f5
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/Kmer.java
@@ -0,0 +1,299 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.type.old;
+
+@Deprecated
+public class Kmer {
+
+ public final static byte[] GENE_SYMBOL = { 'A', 'C', 'G', 'T' };
+
+ public final static class GENE_CODE {
+
+ /**
+ * make sure this 4 ids equal to the sequence id of char in {@GENE_SYMBOL}
+ */
+ public static final byte A = 0;
+ public static final byte C = 1;
+ public static final byte G = 2;
+ public static final byte T = 3;
+
+ public static byte getCodeFromSymbol(byte ch) {
+ byte r = 0;
+ switch (ch) {
+ case 'A':
+ case 'a':
+ r = A;
+ break;
+ case 'C':
+ case 'c':
+ r = C;
+ break;
+ case 'G':
+ case 'g':
+ r = G;
+ break;
+ case 'T':
+ case 't':
+ r = T;
+ break;
+ }
+ return r;
+ }
+
+ public static byte getSymbolFromCode(byte code) {
+ if (code > 3) {
+ return '!';
+ }
+ return GENE_SYMBOL[code];
+ }
+
+ public static byte getAdjBit(byte t) {
+ byte r = 0;
+ switch (t) {
+ case 'A':
+ case 'a':
+ r = 1 << A;
+ break;
+ case 'C':
+ case 'c':
+ r = 1 << C;
+ break;
+ case 'G':
+ case 'g':
+ r = 1 << G;
+ break;
+ case 'T':
+ case 't':
+ r = 1 << T;
+ break;
+ }
+ return r;
+ }
+
+ /**
+ * It works for path merge.
+ * Merge the kmer by his next, we need to make sure the @{t} is a single neighbor.
+ *
+ * @param t
+ * the neighbor code in BitMap
+ * @return the genecode
+ */
+ public static byte getGeneCodeFromBitMap(byte t) {
+ switch (t) {
+ case 1 << A:
+ return A;
+ case 1 << C:
+ return C;
+ case 1 << G:
+ return G;
+ case 1 << T:
+ return T;
+ }
+ return -1;
+ }
+
+ public static byte mergePreNextAdj(byte pre, byte next) {
+ return (byte) (pre << 4 | (next & 0x0f));
+ }
+
+ public static String getSymbolFromBitMap(byte code) {
+ int left = (code >> 4) & 0x0F;
+ int right = code & 0x0F;
+ StringBuilder str = new StringBuilder();
+ for (int i = A; i <= T; i++) {
+ if ((left & (1 << i)) != 0) {
+ str.append((char) GENE_SYMBOL[i]);
+ }
+ }
+ str.append('|');
+ for (int i = A; i <= T; i++) {
+ if ((right & (1 << i)) != 0) {
+ str.append((char) GENE_SYMBOL[i]);
+ }
+ }
+ return str.toString();
+ }
+ }
+
+ public static String recoverKmerFrom(int k, byte[] keyData, int keyStart, int keyLength) {
+ StringBuilder strKmer = new StringBuilder();
+ int byteId = keyStart + keyLength - 1;
+ byte currentbyte = keyData[byteId];
+ for (int geneCount = 0; geneCount < k; geneCount++) {
+ if (geneCount % 4 == 0 && geneCount > 0) {
+ currentbyte = keyData[--byteId];
+ }
+ strKmer.append((char) GENE_SYMBOL[(currentbyte >> ((geneCount % 4) * 2)) & 0x03]);
+ }
+ return strKmer.toString();
+ }
+
+ public static int getByteNumFromK(int k) {
+ int x = k / 4;
+ if (k % 4 != 0) {
+ x += 1;
+ }
+ return x;
+ }
+
+ /**
+ * Compress Kmer into bytes array AATAG will compress as [0x000G, 0xATAA]
+ *
+ * @param kmer
+ * @param input
+ * array
+ * @param start
+ * position
+ * @return initialed kmer array
+ */
+ public static byte[] compressKmer(int k, byte[] array, int start) {
+ final int byteNum = getByteNumFromK(k);
+ byte[] bytes = new byte[byteNum];
+
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = byteNum - 1;
+ for (int i = start; i < start + k; i++) {
+ byte code = GENE_CODE.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[0] = l;
+ }
+ return bytes;
+ }
+
+ /**
+ * Shift Kmer to accept new input
+ *
+ * @param kmer
+ * @param bytes
+ * Kmer Array
+ * @param c
+ * Input new gene character
+ * @return the shiftout gene, in gene code format
+ */
+ public static byte moveKmer(int k, byte[] kmer, byte c) {
+ int byteNum = kmer.length;
+ byte output = (byte) (kmer[byteNum - 1] & 0x03);
+ for (int i = byteNum - 1; i > 0; i--) {
+ byte in = (byte) (kmer[i - 1] & 0x03);
+ kmer[i] = (byte) (((kmer[i] >>> 2) & 0x3f) | (in << 6));
+ }
+ int pos = ((k - 1) % 4) << 1;
+ byte code = (byte) (GENE_CODE.getCodeFromSymbol(c) << pos);
+ kmer[0] = (byte) (((kmer[0] >>> 2) & 0x3f) | code);
+ return (byte) (1 << output);
+ }
+
+ public static byte reverseKmerByte(byte k) {
+ int x = (((k >> 2) & 0x33) | ((k << 2) & 0xcc));
+ return (byte) (((x >> 4) & 0x0f) | ((x << 4) & 0xf0));
+ }
+
+ public static byte[] reverseKmer(int k, byte[] kmer) {
+ byte[] reverseKmer = new byte[kmer.length];
+
+ int curPosAtKmer = ((k - 1) % 4) << 1;
+ int curByteAtKmer = 0;
+
+ int curPosAtReverse = 0;
+ int curByteAtReverse = reverseKmer.length - 1;
+ reverseKmer[curByteAtReverse] = 0;
+ for (int i = 0; i < k; i++) {
+ byte gene = (byte) ((kmer[curByteAtKmer] >> curPosAtKmer) & 0x03);
+ reverseKmer[curByteAtReverse] |= gene << curPosAtReverse;
+ curPosAtReverse += 2;
+ if (curPosAtReverse >= 8) {
+ curPosAtReverse = 0;
+ reverseKmer[--curByteAtReverse] = 0;
+ }
+ curPosAtKmer -= 2;
+ if (curPosAtKmer < 0) {
+ curPosAtKmer = 6;
+ curByteAtKmer++;
+ }
+ }
+
+ return reverseKmer;
+ }
+
+ /**
+ * Compress Reversed Kmer into bytes array AATAG will compress as
+ * [0x000A,0xATAG]
+ *
+ * @param kmer
+ * @param input
+ * array
+ * @param start
+ * position
+ * @return initialed kmer array
+ */
+ public static byte[] compressKmerReverse(int k, byte[] array, int start) {
+ final int byteNum = getByteNumFromK(k);
+ byte[] bytes = new byte[byteNum];
+
+ byte l = 0;
+ int bytecount = 0;
+ int bcount = byteNum - 1;
+ for (int i = start + k - 1; i >= 0; i--) {
+ byte code = GENE_CODE.getCodeFromSymbol(array[i]);
+ l |= (byte) (code << bytecount);
+ bytecount += 2;
+ if (bytecount == 8) {
+ bytes[bcount--] = l;
+ l = 0;
+ bytecount = 0;
+ }
+ }
+ if (bcount >= 0) {
+ bytes[0] = l;
+ }
+ return bytes;
+ }
+
+ /**
+ * Shift Kmer to accept new input
+ *
+ * @param kmer
+ * @param bytes
+ * Kmer Array
+ * @param c
+ * Input new gene character
+ * @return the shiftout gene, in gene code format
+ */
+ public static byte moveKmerReverse(int k, byte[] kmer, byte c) {
+ int pos = ((k - 1) % 4) << 1;
+ byte output = (byte) ((kmer[0] >> pos) & 0x03);
+ for (int i = 0; i < kmer.length - 1; i++) {
+ byte in = (byte) ((kmer[i + 1] >> 6) & 0x03);
+ kmer[i] = (byte) ((kmer[i] << 2) | in);
+ }
+ // (k%4) * 2
+ if (k % 4 != 0) {
+ kmer[0] &= (1 << ((k % 4) << 1)) - 1;
+ }
+ kmer[kmer.length - 1] = (byte) ((kmer[kmer.length - 1] << 2) | GENE_CODE.getCodeFromSymbol(c));
+ return (byte) (1 << output);
+ }
+
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/KmerBytesWritable.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/KmerBytesWritable.java
new file mode 100644
index 0000000..7a96d2f
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/KmerBytesWritable.java
@@ -0,0 +1,136 @@
+package edu.uci.ics.genomix.type.old;
+
+import java.io.IOException;
+import java.io.DataInput;
+import java.io.DataOutput;
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+
+@Deprecated
+public class KmerBytesWritable extends BinaryComparable implements WritableComparable<BinaryComparable> {
+ private static final int LENGTH_BYTES = 4;
+ private static final byte[] EMPTY_BYTES = {};
+ private byte size;
+ private byte[] bytes;
+
+ public KmerBytesWritable() {
+ this(EMPTY_BYTES);
+ }
+
+ public KmerBytesWritable(byte[] bytes) {
+ this.bytes = bytes;
+ this.size = (byte) bytes.length;
+ }
+
+ @Override
+ public byte[] getBytes() {
+ return bytes;
+ }
+
+ @Deprecated
+ public byte[] get() {
+ return getBytes();
+ }
+
+ @Override
+ public int getLength() {
+ return (int) size;
+ }
+
+ @Deprecated
+ public int getSize() {
+ return getLength();
+ }
+
+ public void setSize(byte size) {
+ if ((int) size > getCapacity()) {
+ setCapacity((byte) (size * 3 / 2));
+ }
+ this.size = size;
+ }
+
+ public int getCapacity() {
+ return bytes.length;
+ }
+
+ public void setCapacity(byte new_cap) {
+ if (new_cap != getCapacity()) {
+ byte[] new_data = new byte[new_cap];
+ if (new_cap < size) {
+ size = new_cap;
+ }
+ if (size != 0) {
+ System.arraycopy(bytes, 0, new_data, 0, size);
+ }
+ bytes = new_data;
+ }
+ }
+
+ public void set(KmerBytesWritable newData) {
+ set(newData.bytes, (byte) 0, newData.size);
+ }
+
+ public void set(byte[] newData, byte offset, byte length) {
+ setSize((byte) 0);
+ setSize(length);
+ System.arraycopy(newData, offset, bytes, 0, size);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ setSize((byte) 0); // clear the old data
+ setSize(in.readByte());
+ in.readFully(bytes, 0, size);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(size);
+ out.write(bytes, 0, size);
+ }
+
+ @Override
+ public int hashCode() {
+ return super.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object right_obj) {
+ if (right_obj instanceof KmerBytesWritable)
+ return super.equals(right_obj);
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ StringBuffer sb = new StringBuffer(3 * size);
+ for (int idx = 0; idx < (int) size; idx++) {
+ // if not the first, put a blank separator in
+ if (idx != 0) {
+ sb.append(' ');
+ }
+ String num = Integer.toHexString(0xff & bytes[idx]);
+ // if it is only one digit, add a leading 0.
+ if (num.length() < 2) {
+ sb.append('0');
+ }
+ sb.append(num);
+ }
+ return sb.toString();
+ }
+
+ public static class Comparator extends WritableComparator {
+ public Comparator() {
+ super(KmerBytesWritable.class);
+ }
+
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ return compareBytes(b1, s1 + LENGTH_BYTES, l1 - LENGTH_BYTES, b2, s2 + LENGTH_BYTES, l2 - LENGTH_BYTES);
+ }
+ }
+
+ static { // register this comparator
+ WritableComparator.define(KmerBytesWritable.class, new Comparator());
+ }
+
+}
diff --git a/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/KmerUtil.java b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/KmerUtil.java
new file mode 100644
index 0000000..a4b1bec
--- /dev/null
+++ b/genomix/genomix-data/src/main/java/edu/uci/ics/genomix/type/old/KmerUtil.java
@@ -0,0 +1,241 @@
+package edu.uci.ics.genomix.type.old;
+
+import java.util.Arrays;
+
+@Deprecated
+public class KmerUtil {
+
+ public static int countNumberOfBitSet(int i) {
+ int c = 0;
+ for (; i != 0; c++) {
+ i &= i - 1;
+ }
+ return c;
+ }
+
+ public static int inDegree(byte bitmap) {
+ return countNumberOfBitSet((bitmap >> 4) & 0x0f);
+ }
+
+ public static int outDegree(byte bitmap) {
+ return countNumberOfBitSet(bitmap & 0x0f);
+ }
+
+ /**
+ * Get last kmer from kmer-chain.
+ * e.g. kmerChain is AAGCTA, if k =5, it will
+ * return AGCTA
+ *
+ * @param k
+ * @param kInChain
+ * @param kmerChain
+ * @return LastKmer bytes array
+ */
+ public static byte[] getLastKmerFromChain(int k, int kInChain, byte[] kmerChain, int offset, int length) {
+ if (k > kInChain) {
+ return null;
+ }
+ if (k == kInChain) {
+ return kmerChain.clone();
+ }
+ int byteNum = Kmer.getByteNumFromK(k);
+ byte[] kmer = new byte[byteNum];
+
+ /** from end to start */
+ int byteInChain = length - 1 - (kInChain - k) / 4;
+ int posInByteOfChain = ((kInChain - k) % 4) << 1; // *2
+ int byteInKmer = byteNum - 1;
+ for (; byteInKmer >= 0 && byteInChain > 0; byteInKmer--, byteInChain--) {
+ kmer[byteInKmer] = (byte) ((0xff & kmerChain[offset + byteInChain]) >> posInByteOfChain);
+ kmer[byteInKmer] |= ((kmerChain[offset + byteInChain - 1] << (8 - posInByteOfChain)));
+ }
+
+ /** last kmer byte */
+ if (byteInKmer == 0) {
+ kmer[0] = (byte) ((kmerChain[offset] & 0xff) >> posInByteOfChain);
+ }
+ return kmer;
+ }
+
+ /**
+ * Get first kmer from kmer-chain e.g. kmerChain is AAGCTA, if k=5, it will
+ * return AAGCT
+ *
+ * @param k
+ * @param kInChain
+ * @param kmerChain
+ * @return FirstKmer bytes array
+ */
+ public static byte[] getFirstKmerFromChain(int k, int kInChain, byte[] kmerChain, int offset, int length) {
+ if (k > kInChain) {
+ return null;
+ }
+ if (k == kInChain) {
+ return kmerChain.clone();
+ }
+ int byteNum = Kmer.getByteNumFromK(k);
+ byte[] kmer = new byte[byteNum];
+
+ int i = 1;
+ for (; i < kmer.length; i++) {
+ kmer[kmer.length - i] = kmerChain[offset + length - i];
+ }
+ int posInByteOfChain = (k % 4) << 1; // *2
+ if (posInByteOfChain == 0) {
+ kmer[0] = kmerChain[offset + length - i];
+ } else {
+ kmer[0] = (byte) (kmerChain[offset + length - i] & ((1 << posInByteOfChain) - 1));
+ }
+ return kmer;
+ }
+
+ /**
+ * Merge kmer with next neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAGCTA
+ *
+ * @param k
+ * :input k of kmer
+ * @param kmer
+ * : input bytes of kmer
+ * @param nextCode
+ * : next neighbor in gene-code format
+ * @return the merged Kmer, this K of this Kmer is k+1
+ */
+ public static byte[] mergeKmerWithNextCode(int k, byte[] kmer, int offset, int length, byte nextCode) {
+ int byteNum = length;
+ if (k % 4 == 0) {
+ byteNum++;
+ }
+ byte[] mergedKmer = new byte[byteNum];
+ for (int i = 1; i <= length; i++) {
+ mergedKmer[mergedKmer.length - i] = kmer[offset + length - i];
+ }
+ if (mergedKmer.length > length) {
+ mergedKmer[0] = (byte) (nextCode & 0x3);
+ } else {
+ mergedKmer[0] = (byte) (kmer[offset] | ((nextCode & 0x3) << ((k % 4) << 1)));
+ }
+ return mergedKmer;
+ }
+
+ /**
+ * Merge kmer with previous neighbor in gene-code format.
+ * The k of new kmer will increase by 1
+ * e.g. AAGCT merge with A => AAAGCT
+ *
+ * @param k
+ * :input k of kmer
+ * @param kmer
+ * : input bytes of kmer
+ * @param preCode
+ * : next neighbor in gene-code format
+ * @return the merged Kmer,this K of this Kmer is k+1
+ */
+ public static byte[] mergeKmerWithPreCode(int k, byte[] kmer, int offset, int length, byte preCode) {
+ int byteNum = length;
+ byte[] mergedKmer = null;
+ int byteInMergedKmer = 0;
+ if (k % 4 == 0) {
+ byteNum++;
+ mergedKmer = new byte[byteNum];
+ mergedKmer[0] = (byte) ((kmer[offset] >> 6) & 0x3);
+ byteInMergedKmer++;
+ } else {
+ mergedKmer = new byte[byteNum];
+ }
+ for (int i = 0; i < length - 1; i++, byteInMergedKmer++) {
+ mergedKmer[byteInMergedKmer] = (byte) ((kmer[offset + i] << 2) | ((kmer[offset + i + 1] >> 6) & 0x3));
+ }
+ mergedKmer[byteInMergedKmer] = (byte) ((kmer[offset + length - 1] << 2) | (preCode & 0x3));
+ return mergedKmer;
+ }
+
+ /**
+ * Merge two kmer to one kmer
+ * e.g. ACTA + ACCGT => ACTAACCGT
+ *
+ * @param preK
+ * : previous k of kmer
+ * @param kmerPre
+ * : bytes array of previous kmer
+ * @param nextK
+ * : next k of kmer
+ * @param kmerNext
+ * : bytes array of next kmer
+ * @return merged kmer, the new k is @preK + @nextK
+ */
+ public static byte[] mergeTwoKmer(int preK, byte[] kmerPre, int offsetPre, int lengthPre, int nextK,
+ byte[] kmerNext, int offsetNext, int lengthNext) {
+ int byteNum = Kmer.getByteNumFromK(preK + nextK);
+ byte[] mergedKmer = new byte[byteNum];
+ int i = 1;
+ for (; i <= lengthPre; i++) {
+ mergedKmer[byteNum - i] = kmerPre[offsetPre + lengthPre - i];
+ }
+ if (i > 1) {
+ i--;
+ }
+ if (preK % 4 == 0) {
+ for (int j = 1; j <= lengthNext; j++) {
+ mergedKmer[byteNum - i - j] = kmerNext[offsetNext + lengthNext - j];
+ }
+ } else {
+ int posNeedToMove = ((preK % 4) << 1);
+ mergedKmer[byteNum - i] |= kmerNext[offsetNext + lengthNext - 1] << posNeedToMove;
+ for (int j = 1; j < lengthNext; j++) {
+ mergedKmer[byteNum - i - j] = (byte) (((kmerNext[offsetNext + lengthNext - j] & 0xff) >> (8 - posNeedToMove)) | (kmerNext[offsetNext
+ + lengthNext - j - 1] << posNeedToMove));
+ }
+ if ((nextK % 4) * 2 + posNeedToMove > 8) {
+ mergedKmer[0] = (byte) (kmerNext[offsetNext] >> (8 - posNeedToMove));
+ }
+ }
+ return mergedKmer;
+ }
+
+ /**
+ * Safely shifted the kmer forward without change the input kmer
+ * e.g. AGCGC shift with T => GCGCT
+ *
+ * @param k
+ * : kmer length
+ * @param kmer
+ * : input kmer
+ * @param afterCode
+ * : input genecode
+ * @return new created kmer that shifted by afterCode, the K will not change
+ */
+ public static byte[] shiftKmerWithNextCode(int k, final byte[] kmer, int offset, int length, byte afterCode) {
+ byte[] shifted = Arrays.copyOfRange(kmer, offset, offset + length);
+ Kmer.moveKmer(k, shifted, Kmer.GENE_CODE.getSymbolFromCode(afterCode));
+ return shifted;
+ }
+
+ /**
+ * Safely shifted the kmer backward without change the input kmer
+ * e.g. AGCGC shift with T => TAGCG
+ *
+ * @param k
+ * : kmer length
+ * @param kmer
+ * : input kmer
+ * @param preCode
+ * : input genecode
+ * @return new created kmer that shifted by preCode, the K will not change
+ */
+ public static byte[] shiftKmerWithPreCode(int k, final byte[] kmer, int offset, int length, byte preCode) {
+ byte[] shifted = Arrays.copyOfRange(kmer, offset, offset + length);
+ Kmer.moveKmerReverse(k, shifted, Kmer.GENE_CODE.getSymbolFromCode(preCode));
+ return shifted;
+ }
+
+ public static byte getGeneCodeAtPosition(int pos, int k, final byte[] kmer, int offset, int length) {
+ if (pos >= k) {
+ return -1;
+ }
+ int posByte = pos / 4;
+ int shift = (pos % 4) << 1;
+ return (byte) ((kmer[offset + length - 1 - posByte] >> shift) & 0x3);
+ }
+}
diff --git a/genomix/genomix-data/src/main/resources/conf/cluster.properties b/genomix/genomix-data/src/main/resources/conf/cluster.properties
new file mode 100644
index 0000000..eabd81b
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/conf/cluster.properties
@@ -0,0 +1,40 @@
+#The CC port for Hyracks clients
+CC_CLIENTPORT=3099
+
+#The CC port for Hyracks cluster management
+CC_CLUSTERPORT=1099
+
+#The directory of hyracks binaries
+HYRACKS_HOME="../../../../hyracks"
+
+#The tmp directory for cc to install jars
+CCTMP_DIR=/tmp/t1
+
+#The tmp directory for nc to install jars
+NCTMP_DIR=/tmp/t2
+
+#The directory to put cc logs
+CCLOGS_DIR=$CCTMP_DIR/logs
+
+#The directory to put nc logs
+NCLOGS_DIR=$NCTMP_DIR/logs
+
+#Comma separated I/O directories for the spilling of external sort
+IO_DIRS="/tmp/t3,/tmp/t4"
+
+#The JAVA_HOME
+JAVA_HOME=$JAVA_HOME
+
+#HADOOP_HOME
+CLASSPATH="${HADOOP_HOME}:${CLASSPATH}:."
+
+#The frame size of the internal dataflow engine
+FRAME_SIZE=65536
+
+#CC JAVA_OPTS
+CCJAVA_OPTS="-Xdebug -Xrunjdwp:transport=dt_socket,address=7001,server=y,suspend=n -Xmx1g -Djava.util.logging.config.file=logging.properties"
+# Yourkit option: -agentpath:/grid/0/dev/vborkar/tools/yjp-10.0.4/bin/linux-x86-64/libyjpagent.so=port=20001"
+
+#NC JAVA_OPTS
+NCJAVA_OPTS="-Xdebug -Xrunjdwp:transport=dt_socket,address=7002,server=y,suspend=n -Xmx10g -Djava.util.logging.config.file=logging.properties"
+
diff --git a/genomix/genomix-data/src/main/resources/conf/debugnc.properties b/genomix/genomix-data/src/main/resources/conf/debugnc.properties
new file mode 100644
index 0000000..27afa26
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/conf/debugnc.properties
@@ -0,0 +1,12 @@
+#The tmp directory for nc to install jars
+NCTMP_DIR2=/tmp/t-1
+
+#The directory to put nc logs
+NCLOGS_DIR2=$NCTMP_DIR/logs
+
+#Comma separated I/O directories for the spilling of external sort
+IO_DIRS2="/tmp/t-2,/tmp/t-3"
+
+#NC JAVA_OPTS
+NCJAVA_OPTS2="-Xdebug -Xrunjdwp:transport=dt_socket,address=7003,server=y,suspend=n -Xmx1g -Djava.util.logging.config.file=logging.properties"
+
diff --git a/genomix/genomix-data/src/main/resources/conf/master b/genomix/genomix-data/src/main/resources/conf/master
new file mode 100644
index 0000000..2fbb50c
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/conf/master
@@ -0,0 +1 @@
+localhost
diff --git a/genomix/genomix-data/src/main/resources/conf/slaves b/genomix/genomix-data/src/main/resources/conf/slaves
new file mode 100644
index 0000000..2fbb50c
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/conf/slaves
@@ -0,0 +1 @@
+localhost
diff --git a/genomix/genomix-data/src/main/resources/scripts/genomix b/genomix/genomix-data/src/main/resources/scripts/genomix
new file mode 100644
index 0000000..bdd7f20
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/genomix
@@ -0,0 +1,113 @@
+#!/bin/sh
+# ----------------------------------------------------------------------------
+# Copyright 2001-2006 The Apache Software Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+#
+# Copyright (c) 2001-2006 The Apache Software Foundation. All rights
+# reserved.
+
+
+# resolve links - $0 may be a softlink
+PRG="$0"
+
+while [ -h "$PRG" ]; do
+ ls=`ls -ld "$PRG"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '/.*' > /dev/null; then
+ PRG="$link"
+ else
+ PRG=`dirname "$PRG"`/"$link"
+ fi
+done
+
+PRGDIR=`dirname "$PRG"`
+BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd`
+
+
+
+# OS specific support. $var _must_ be set to either true or false.
+cygwin=false;
+darwin=false;
+case "`uname`" in
+ CYGWIN*) cygwin=true ;;
+ Darwin*) darwin=true
+ if [ -z "$JAVA_VERSION" ] ; then
+ JAVA_VERSION="CurrentJDK"
+ else
+ echo "Using Java version: $JAVA_VERSION"
+ fi
+ if [ -z "$JAVA_HOME" ] ; then
+ JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home
+ fi
+ ;;
+esac
+
+if [ -z "$JAVA_HOME" ] ; then
+ if [ -r /etc/gentoo-release ] ; then
+ JAVA_HOME=`java-config --jre-home`
+ fi
+fi
+
+# For Cygwin, ensure paths are in UNIX format before anything is touched
+if $cygwin ; then
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
+ [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
+fi
+
+# If a specific java binary isn't specified search for the standard 'java' binary
+if [ -z "$JAVACMD" ] ; then
+ if [ -n "$JAVA_HOME" ] ; then
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+ # IBM's JDK on AIX uses strange locations for the executables
+ JAVACMD="$JAVA_HOME/jre/sh/java"
+ else
+ JAVACMD="$JAVA_HOME/bin/java"
+ fi
+ else
+ JAVACMD=`which java`
+ fi
+fi
+
+if [ ! -x "$JAVACMD" ] ; then
+ echo "Error: JAVA_HOME is not defined correctly." 1>&2
+ echo " We cannot execute $JAVACMD" 1>&2
+ exit 1
+fi
+
+if [ -z "$REPO" ]
+then
+ REPO="$BASEDIR"/lib
+fi
+
+CLASSPATH=$CLASSPATH_PREFIX:"$BASEDIR"/etc:"$REPO"/hyracks-dataflow-std-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-api-0.2.3-SNAPSHOT.jar:"$REPO"/json-20090211.jar:"$REPO"/httpclient-4.1-alpha2.jar:"$REPO"/httpcore-4.1-beta1.jar:"$REPO"/commons-logging-1.1.1.jar:"$REPO"/commons-codec-1.4.jar:"$REPO"/args4j-2.0.12.jar:"$REPO"/commons-lang3-3.1.jar:"$REPO"/hyracks-dataflow-common-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-data-std-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-control-cc-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-control-common-0.2.3-SNAPSHOT.jar:"$REPO"/jetty-server-8.0.0.RC0.jar:"$REPO"/servlet-api-3.0.20100224.jar:"$REPO"/jetty-continuation-8.0.0.RC0.jar:"$REPO"/jetty-http-8.0.0.RC0.jar:"$REPO"/jetty-io-8.0.0.RC0.jar:"$REPO"/jetty-webapp-8.0.0.RC0.jar:"$REPO"/jetty-xml-8.0.0.RC0.jar:"$REPO"/jetty-util-8.0.0.RC0.jar:"$REPO"/jetty-servlet-8.0.0.RC0.jar:"$REPO"/jetty-security-8.0.0.RC0.jar:"$REPO"/wicket-core-1.5.2.jar:"$REPO"/wicket-util-1.5.2.jar:"$REPO"/wicket-request-1.5.2.jar:"$REPO"/slf4j-api-1.6.1.jar:"$REPO"/slf4j-jcl-1.6.3.jar:"$REPO"/hyracks-control-nc-0.2.3-SNAPSHOT.jar:"$REPO"/dcache-client-0.0.1.jar:"$REPO"/jetty-client-8.0.0.M0.jar:"$REPO"/hyracks-net-0.2.3-SNAPSHOT.jar:"$REPO"/commons-io-1.3.1.jar:"$REPO"/hyracks-ipc-0.2.3-SNAPSHOT.jar:"$REPO"/genomix-0.2.3-SNAPSHOT.pom
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin; then
+ [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
+ [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"`
+ [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"`
+ [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"`
+fi
+
+exec "$JAVACMD" $JAVA_OPTS \
+ -classpath "$CLASSPATH" \
+ -Dapp.name="genomix" \
+ -Dapp.pid="$$" \
+ -Dapp.repo="$REPO" \
+ -Dapp.home="$BASEDIR" \
+ -Dbasedir="$BASEDIR" \
+ edu.uci.ics.genomix.driver.Driver \
+ "$@"
diff --git a/genomix/genomix-data/src/main/resources/scripts/genomix.bat b/genomix/genomix-data/src/main/resources/scripts/genomix.bat
new file mode 100644
index 0000000..1bd2098
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/genomix.bat
@@ -0,0 +1,108 @@
+@REM ----------------------------------------------------------------------------
+@REM Copyright 2001-2006 The Apache Software Foundation.
+@REM
+@REM Licensed under the Apache License, Version 2.0 (the "License");
+@REM you may not use this file except in compliance with the License.
+@REM You may obtain a copy of the License at
+@REM
+@REM http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM Unless required by applicable law or agreed to in writing, software
+@REM distributed under the License is distributed on an "AS IS" BASIS,
+@REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM See the License for the specific language governing permissions and
+@REM limitations under the License.
+@REM ----------------------------------------------------------------------------
+@REM
+@REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights
+@REM reserved.
+
+@echo off
+
+set ERROR_CODE=0
+
+:init
+@REM Decide how to startup depending on the version of windows
+
+@REM -- Win98ME
+if NOT "%OS%"=="Windows_NT" goto Win9xArg
+
+@REM set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" @setlocal
+
+@REM -- 4NT shell
+if "%eval[2+2]" == "4" goto 4NTArgs
+
+@REM -- Regular WinNT shell
+set CMD_LINE_ARGS=%*
+goto WinNTGetScriptDir
+
+@REM The 4NT Shell from jp software
+:4NTArgs
+set CMD_LINE_ARGS=%$
+goto WinNTGetScriptDir
+
+:Win9xArg
+@REM Slurp the command line arguments. This loop allows for an unlimited number
+@REM of arguments (up to the command line limit, anyway).
+set CMD_LINE_ARGS=
+:Win9xApp
+if %1a==a goto Win9xGetScriptDir
+set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1
+shift
+goto Win9xApp
+
+:Win9xGetScriptDir
+set SAVEDIR=%CD%
+%0\
+cd %0\..\..
+set BASEDIR=%CD%
+cd %SAVEDIR%
+set SAVE_DIR=
+goto repoSetup
+
+:WinNTGetScriptDir
+set BASEDIR=%~dp0\..
+
+:repoSetup
+
+
+if "%JAVACMD%"=="" set JAVACMD=java
+
+if "%REPO%"=="" set REPO=%BASEDIR%\lib
+
+set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\hyracks-dataflow-std-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-api-0.2.3-SNAPSHOT.jar;"%REPO%"\json-20090211.jar;"%REPO%"\httpclient-4.1-alpha2.jar;"%REPO%"\httpcore-4.1-beta1.jar;"%REPO%"\commons-logging-1.1.1.jar;"%REPO%"\commons-codec-1.4.jar;"%REPO%"\args4j-2.0.12.jar;"%REPO%"\commons-lang3-3.1.jar;"%REPO%"\hyracks-dataflow-common-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-data-std-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-control-cc-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-control-common-0.2.3-SNAPSHOT.jar;"%REPO%"\jetty-server-8.0.0.RC0.jar;"%REPO%"\servlet-api-3.0.20100224.jar;"%REPO%"\jetty-continuation-8.0.0.RC0.jar;"%REPO%"\jetty-http-8.0.0.RC0.jar;"%REPO%"\jetty-io-8.0.0.RC0.jar;"%REPO%"\jetty-webapp-8.0.0.RC0.jar;"%REPO%"\jetty-xml-8.0.0.RC0.jar;"%REPO%"\jetty-util-8.0.0.RC0.jar;"%REPO%"\jetty-servlet-8.0.0.RC0.jar;"%REPO%"\jetty-security-8.0.0.RC0.jar;"%REPO%"\wicket-core-1.5.2.jar;"%REPO%"\wicket-util-1.5.2.jar;"%REPO%"\wicket-request-1.5.2.jar;"%REPO%"\slf4j-api-1.6.1.jar;"%REPO%"\slf4j-jcl-1.6.3.jar;"%REPO%"\hyracks-control-nc-0.2.3-SNAPSHOT.jar;"%REPO%"\dcache-client-0.0.1.jar;"%REPO%"\jetty-client-8.0.0.M0.jar;"%REPO%"\hyracks-net-0.2.3-SNAPSHOT.jar;"%REPO%"\commons-io-1.3.1.jar;"%REPO%"\hyracks-ipc-0.2.3-SNAPSHOT.jar;"%REPO%"\genomix-0.2.3-SNAPSHOT.pom
+goto endInit
+
+@REM Reaching here means variables are defined and arguments have been captured
+:endInit
+
+%JAVACMD% %JAVA_OPTS% -classpath %CLASSPATH_PREFIX%;%CLASSPATH% -Dapp.name="genomix" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" edu.uci.ics.genomix.driver.Driver %CMD_LINE_ARGS%
+if ERRORLEVEL 1 goto error
+goto end
+
+:error
+if "%OS%"=="Windows_NT" @endlocal
+set ERROR_CODE=%ERRORLEVEL%
+
+:end
+@REM set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" goto endNT
+
+@REM For old DOS remove the set variables from ENV - we assume they were not set
+@REM before we started - at least we don't leave any baggage around
+set CMD_LINE_ARGS=
+goto postExec
+
+:endNT
+@REM If error code is set to 1 then the endlocal was done already in :error.
+if %ERROR_CODE% EQU 0 @endlocal
+
+
+:postExec
+
+if "%FORCE_EXIT_ON_ERROR%" == "on" (
+ if %ERROR_CODE% NEQ 0 exit %ERROR_CODE%
+)
+
+exit /B %ERROR_CODE%
diff --git a/genomix/genomix-data/src/main/resources/scripts/getip.sh b/genomix/genomix-data/src/main/resources/scripts/getip.sh
new file mode 100644
index 0000000..e0cdf73
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/getip.sh
@@ -0,0 +1,21 @@
+#get the OS
+OS_NAME=`uname -a|awk '{print $1}'`
+LINUX_OS='Linux'
+
+if [ "$OS_NAME" = "$LINUX_OS" ];
+then
+ #Get IP Address
+ IPADDR=`/sbin/ifconfig eth0 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ if [ "$IPADDR" = "" ]
+ then
+ IPADDR=`/sbin/ifconfig lo | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ fi
+else
+ IPADDR=`/sbin/ifconfig en1 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ if [ "$IPADDR" = "" ]
+ then
+ IPADDR=`/sbin/ifconfig lo0 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ fi
+
+fi
+echo $IPADDR
diff --git a/genomix/genomix-data/src/main/resources/scripts/startAllNCs.sh b/genomix/genomix-data/src/main/resources/scripts/startAllNCs.sh
new file mode 100644
index 0000000..5e38c40
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/startAllNCs.sh
@@ -0,0 +1,6 @@
+GENOMIX_PATH=`pwd`
+
+for i in `cat conf/slaves`
+do
+ ssh $i "cd ${GENOMIX_PATH}; bin/startnc.sh"
+done
diff --git a/genomix/genomix-data/src/main/resources/scripts/startCluster.sh b/genomix/genomix-data/src/main/resources/scripts/startCluster.sh
new file mode 100755
index 0000000..4727764
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/startCluster.sh
@@ -0,0 +1,19 @@
+bin/startcc.sh
+sleep 5
+bin/startAllNCs.sh
+
+. conf/cluster.properties
+# do we need to specify the version somewhere?
+hyrackcmd=`ls ${HYRACKS_HOME}/hyracks-cli/target/hyracks-cli-*-binary-assembly/bin/hyrackscli`
+# find zip file
+appzip=`ls $PWD/../genomix-*-binary-assembly.zip`
+
+[ -f "$hyrackcmd" ] || { echo "Hyracks commandline is missing"; exit 1; }
+[ -f "$appzip" ] || { echo "Genomix binary-assembly.zip is missing"; exit 1; }
+
+CCHOST_NAME=`cat conf/master`
+
+IPADDR=`bin/getip.sh`
+echo "connect to \"${IPADDR}:${CC_CLIENTPORT}\"; create application genomix \"$appzip\";" | $hyrackcmd
+echo ""
+
diff --git a/genomix/genomix-data/src/main/resources/scripts/startDebugNc.sh b/genomix/genomix-data/src/main/resources/scripts/startDebugNc.sh
new file mode 100644
index 0000000..fe6cf27
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/startDebugNc.sh
@@ -0,0 +1,50 @@
+hostname
+
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CURRENT_PATH=`pwd`
+CCHOST=`ssh ${CCHOST_NAME} "cd ${CURRENT_PATH}; bin/getip.sh"`
+
+#Import cluster properties
+. conf/cluster.properties
+. conf/debugnc.properties
+
+#Clean up temp dir
+
+rm -rf $NCTMP_DIR2
+mkdir $NCTMP_DIR2
+
+#Clean up log dir
+rm -rf $NCLOGS_DIR2
+mkdir $NCLOGS_DIR2
+
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS2 | tr "," "\n")
+for io_dir in $io_dirs
+do
+ rm -rf $io_dir
+ mkdir $io_dir
+done
+
+#Set JAVA_HOME
+export JAVA_HOME=$JAVA_HOME
+
+#Get the IP address of this node
+IPADDR=`bin/getip.sh`
+
+#Get node ID
+NODEID=`hostname | cut -d '.' -f 1`
+NODEID=${NODEID}2
+
+#Set JAVA_OPTS
+export JAVA_OPTS=$NCJAVA_OPTS2
+
+cd $HYRACKS_HOME
+HYRACKS_HOME=`pwd`
+
+#Enter the temp dir
+cd $NCTMP_DIR2
+
+#Launch hyracks nc
+$HYRACKS_HOME/hyracks-server/target/appassembler/bin/hyracksnc -cc-host $CCHOST -cc-port $CC_CLUSTERPORT -cluster-net-ip-address $IPADDR -data-ip-address $IPADDR -node-id $NODEID -iodevices "${IO_DIRS2}" &> $NCLOGS_DIR2/$NODEID.log &
diff --git a/genomix/genomix-data/src/main/resources/scripts/startcc.sh b/genomix/genomix-data/src/main/resources/scripts/startcc.sh
new file mode 100644
index 0000000..fe2551d
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/startcc.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+hostname
+
+#Import cluster properties
+. conf/cluster.properties
+
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CCHOST=`bin/getip.sh`
+
+#Remove the temp dir
+rm -rf $CCTMP_DIR
+mkdir $CCTMP_DIR
+
+#Remove the logs dir
+rm -rf $CCLOGS_DIR
+mkdir $CCLOGS_DIR
+
+#Export JAVA_HOME and JAVA_OPTS
+export JAVA_HOME=$JAVA_HOME
+export JAVA_OPTS=$CCJAVA_OPTS
+
+#Launch hyracks cc script
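+#(-max-heartbeat-lapse-periods 999999 effectively disables NC eviction on
+# missed heartbeats; -default-max-job-attempts 0 turns off automatic job
+# retries; -job-history-size 3 keeps only the last three jobs' history)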
+chmod -R 755 $HYRACKS_HOME
+$HYRACKS_HOME/hyracks-server/target/appassembler/bin/hyrackscc -client-net-ip-address $CCHOST -cluster-net-ip-address $CCHOST -client-net-port $CC_CLIENTPORT -cluster-net-port $CC_CLUSTERPORT -max-heartbeat-lapse-periods 999999 -default-max-job-attempts 0 -job-history-size 3 &> $CCLOGS_DIR/cc.log &
diff --git a/genomix/genomix-data/src/main/resources/scripts/startnc.sh b/genomix/genomix-data/src/main/resources/scripts/startnc.sh
new file mode 100644
index 0000000..6e0f90e
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/startnc.sh
@@ -0,0 +1,49 @@
+hostname
+
+MY_NAME=`hostname`
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CURRENT_PATH=`pwd`
+CCHOST=`ssh ${CCHOST_NAME} "cd ${CURRENT_PATH}; bin/getip.sh"`
+
+#Import cluster properties
+. conf/cluster.properties
+
+#Clean up temp dir
+
+rm -rf $NCTMP_DIR
+mkdir $NCTMP_DIR
+
+#Clean up log dir
+rm -rf $NCLOGS_DIR
+mkdir $NCLOGS_DIR
+
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS | tr "," "\n")
+for io_dir in $io_dirs
+do
+ rm -rf $io_dir
+ mkdir $io_dir
+done
+
+#Set JAVA_HOME
+export JAVA_HOME=$JAVA_HOME
+
+IPADDR=`bin/getip.sh`
+#echo $IPADDR
+
+#Get node ID
+NODEID=`hostname | cut -d '.' -f 1`
+
+#Set JAVA_OPTS
+export JAVA_OPTS=$NCJAVA_OPTS
+
+cd $HYRACKS_HOME
+HYRACKS_HOME=`pwd`
+
+#Enter the temp dir
+cd $NCTMP_DIR
+
+#Launch hyracks nc
+$HYRACKS_HOME/hyracks-server/target/appassembler/bin/hyracksnc -cc-host $CCHOST -cc-port $CC_CLUSTERPORT -cluster-net-ip-address $IPADDR -data-ip-address $IPADDR -node-id $NODEID -iodevices "${IO_DIRS}" &> $NCLOGS_DIR/$NODEID.log &
diff --git a/genomix/genomix-data/src/main/resources/scripts/stopAllNCs.sh b/genomix/genomix-data/src/main/resources/scripts/stopAllNCs.sh
new file mode 100644
index 0000000..66ed866
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/stopAllNCs.sh
@@ -0,0 +1,6 @@
+GENOMIX_PATH=`pwd`
+
+for i in `cat conf/slaves`
+do
+ ssh $i "cd ${GENOMIX_PATH}; bin/stopnc.sh"
+done
diff --git a/genomix/genomix-data/src/main/resources/scripts/stopCluster.sh b/genomix/genomix-data/src/main/resources/scripts/stopCluster.sh
new file mode 100644
index 0000000..4889934
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/stopCluster.sh
@@ -0,0 +1,3 @@
+bin/stopAllNCs.sh
+sleep 2
+bin/stopcc.sh
diff --git a/genomix/genomix-data/src/main/resources/scripts/stopcc.sh b/genomix/genomix-data/src/main/resources/scripts/stopcc.sh
new file mode 100644
index 0000000..1865054
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/stopcc.sh
@@ -0,0 +1,10 @@
+hostname
+. conf/cluster.properties
+
+#Kill process
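+#(matches every java process owned by ${USER} whose command line mentions hyracks)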
+PID=`ps -ef|grep ${USER}|grep java|grep hyracks|awk '{print $2}'`
+echo $PID
+[ "$PID" != "" ] && kill -9 $PID
+
+#Clean up CC temp dir
+rm -rf $CCTMP_DIR/*
diff --git a/genomix/genomix-data/src/main/resources/scripts/stopnc.sh b/genomix/genomix-data/src/main/resources/scripts/stopnc.sh
new file mode 100644
index 0000000..3928bb7
--- /dev/null
+++ b/genomix/genomix-data/src/main/resources/scripts/stopnc.sh
@@ -0,0 +1,23 @@
+hostname
+. conf/cluster.properties
+
+#Kill process
+PID=`ps -ef|grep ${USER}|grep java|grep 'Dapp.name=hyracksnc'|awk '{print $2}'`
+
+if [ "$PID" == "" ]; then
+ USERID=`id | sed 's/^uid=//;s/(.*$//'`
+ PID=`ps -ef|grep ${USERID}|grep java|grep 'Dapp.name=hyracksnc'|awk '{print $2}'`
+fi
+
+echo $PID
+[ "$PID" != "" ] && kill -9 $PID
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS | tr "," "\n")
+for io_dir in $io_dirs
+do
+ rm -rf $io_dir/*
+done
+
+#Clean up NC temp dir
+rm -rf $NCTMP_DIR/*
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
new file mode 100644
index 0000000..faee509
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/KmerBytesWritableTest.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.example.kmer;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
+public class KmerBytesWritableTest {
+ static byte[] array = { 'A', 'A', 'T', 'A', 'G', 'A', 'A', 'G' };
+ static int k = 7;
+
+ @Test
+ public void TestCompressKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ kmer.setByRead(array, 1);
+ Assert.assertEquals(kmer.toString(), "ATAGAAG");
+ }
+
+ @Test
+ public void TestMoveKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
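+ // With k == 7 and an 8-character read this loop never executes;
+ // the assert inside it fails the test if it ever does.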
+ for (int i = k; i < array.length - 1; i++) {
+ kmer.shiftKmerWithNextCode(array[i]);
+ Assert.assertTrue(false);
+ }
+
+ byte out = kmer.shiftKmerWithNextChar(array[array.length - 1]);
+ Assert.assertEquals(out, GeneCode.getCodeFromSymbol((byte) 'A'));
+ Assert.assertEquals(kmer.toString(), "ATAGAAG");
+ }
+
+ @Test
+ public void TestCompressKmerReverse() {
+ KmerBytesWritable kmer = new KmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
+ kmer.setByReadReverse(array, 1);
+ Assert.assertEquals(kmer.toString(), "GAAGATA");
+ }
+
+ @Test
+ public void TestMoveKmerReverse() {
+ KmerBytesWritable kmer = new KmerBytesWritable(k);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AATAGAA");
+
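+ // With k == 7 and an 8-character read this loop never executes;
+ // the assert inside it fails the test if it ever does.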
+ for (int i = k; i < array.length - 1; i++) {
+ kmer.shiftKmerWithPreChar(array[i]);
+ Assert.assertTrue(false);
+ }
+
+ byte out = kmer.shiftKmerWithPreChar(array[array.length - 1]);
+ Assert.assertEquals(out, GeneCode.getCodeFromSymbol((byte) 'A'));
+ Assert.assertEquals(kmer.toString(), "GAATAGA");
+ }
+
+ @Test
+ public void TestGetGene() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ String text = "AGCTGACCG";
+ byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G' };
+ kmer.setByRead(array, 0);
+
+ for (int i = 0; i < 9; i++) {
+ Assert.assertEquals(text.charAt(i), (char) (GeneCode.getSymbolFromCode(kmer.getGeneCodeAtPosition(i))));
+ }
+ }
+
+}
diff --git a/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
new file mode 100644
index 0000000..7c4c675
--- /dev/null
+++ b/genomix/genomix-data/src/test/java/edu/uci/ics/genomix/example/kmer/VKmerBytesWritableFactoryTest.java
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.example.kmer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+public class VKmerBytesWritableFactoryTest {
+ static byte[] array = { 'A', 'G', 'C', 'T', 'G', 'A', 'C', 'C', 'G', 'T' };
+
+ VKmerBytesWritableFactory kmerFactory = new VKmerBytesWritableFactory(8);
+
+ @Test
+ public void TestDegree() {
+ Assert.assertTrue(GeneCode.inDegree((byte) 0xff) == 4);
+ Assert.assertTrue(GeneCode.outDegree((byte) 0xff) == 4);
+ Assert.assertTrue(GeneCode.inDegree((byte) 0x3f) == 2);
+ Assert.assertTrue(GeneCode.outDegree((byte) 0x01) == 1);
+ Assert.assertTrue(GeneCode.inDegree((byte) 0x01) == 0);
+ }
+
+ @Test
+ public void TestGetLastKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ KmerBytesWritable lastKmer;
+ for (int i = 8; i > 0; i--) {
+ lastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
+ lastKmer = kmerFactory.getSubKmerFromChain(9 - i, i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9 - i), lastKmer.toString());
+ }
+ VKmerBytesWritable vlastKmer;
+ for (int i = 8; i > 0; i--) {
+ vlastKmer = kmerFactory.getLastKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9 - i), vlastKmer.toString());
+ vlastKmer = kmerFactory.getSubKmerFromChain(9 - i, i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(9 - i), vlastKmer.toString());
+ }
+ }
+
+ @Test
+ public void TestGetFirstKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ KmerBytesWritable firstKmer;
+ for (int i = 8; i > 0; i--) {
+ firstKmer = kmerFactory.getFirstKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(0, i), firstKmer.toString());
+ firstKmer = kmerFactory.getSubKmerFromChain(0, i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(0, i), firstKmer.toString());
+ }
+ VKmerBytesWritable vfirstKmer;
+ for (int i = 8; i > 0; i--) {
+ vfirstKmer = kmerFactory.getFirstKmerFromChain(i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(0, i), vfirstKmer.toString());
+ vfirstKmer = kmerFactory.getSubKmerFromChain(0, i, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(0, i), vfirstKmer.toString());
+ }
+ }
+
+ @Test
+ public void TestGetSubKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ VKmerBytesWritable subKmer;
+ for (int istart = 0; istart < kmer.getKmerLength() - 1; istart++) {
+ for (int isize = 1; isize + istart <= kmer.getKmerLength(); isize++) {
+ subKmer = kmerFactory.getSubKmerFromChain(istart, isize, kmer);
+ Assert.assertEquals("AGCTGACCG".substring(istart, istart + isize), subKmer.toString());
+ }
+ }
+ }
+
+ @Test
+ public void TestMergeNext() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+
+ String text = "AGCTGACCG";
+ for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
+ text = text + (char) GeneCode.GENE_SYMBOL[x];
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithNextCode(kmer, x);
+ text = text + (char) GeneCode.GENE_SYMBOL[x];
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ }
+
+ @Test
+ public void TestMergePre() {
+ KmerBytesWritable kmer = new KmerBytesWritable(9);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals("AGCTGACCG", kmer.toString());
+ String text = "AGCTGACCG";
+ for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
+ text = (char) GeneCode.GENE_SYMBOL[x] + text;
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ for (byte x = GeneCode.A; x <= GeneCode.T; x++) {
+ KmerBytesWritable newkmer = kmerFactory.mergeKmerWithPreCode(kmer, x);
+ text = (char) GeneCode.GENE_SYMBOL[x] + text;
+ Assert.assertEquals(text, newkmer.toString());
+ kmer = new KmerBytesWritable(newkmer);
+ }
+ }
+
+ @Test
+ public void TestMergeTwoKmer() {
+ KmerBytesWritable kmer1 = new KmerBytesWritable(9);
+ kmer1.setByRead(array, 0);
+ String text1 = "AGCTGACCG";
+ KmerBytesWritable kmer2 = new KmerBytesWritable(9);
+ kmer2.setByRead(array, 1);
+ String text2 = "GCTGACCGT";
+ Assert.assertEquals(text1, kmer1.toString());
+ Assert.assertEquals(text2, kmer2.toString());
+
+ KmerBytesWritable merged = kmerFactory.mergeTwoKmer(kmer1, kmer2);
+ Assert.assertEquals(text1 + text2, merged.toString());
+
+ KmerBytesWritable kmer3 = new KmerBytesWritable(3);
+ kmer3.setByRead(array, 1);
+ String text3 = "GCT";
+ Assert.assertEquals(text3, kmer3.toString());
+
+ merged = kmerFactory.mergeTwoKmer(kmer1, kmer3);
+ Assert.assertEquals(text1 + text3, merged.toString());
+ merged = kmerFactory.mergeTwoKmer(kmer3, kmer1);
+ Assert.assertEquals(text3 + text1, merged.toString());
+
+ KmerBytesWritable kmer4 = new KmerBytesWritable(8);
+ kmer4.setByRead(array, 0);
+ String text4 = "AGCTGACC";
+ Assert.assertEquals(text4, kmer4.toString());
+ merged = kmerFactory.mergeTwoKmer(kmer4, kmer3);
+ Assert.assertEquals(text4 + text3, merged.toString());
+
+ KmerBytesWritable kmer5 = new KmerBytesWritable(7);
+ kmer5.setByRead(array, 0);
+ String text5 = "AGCTGAC";
+ VKmerBytesWritable kmer6 = new VKmerBytesWritable(9);
+ kmer6.setByRead(9, array, 1);
+ String text6 = "GCTGACCGT";
+ merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
+ Assert.assertEquals(text5 + text6, merged.toString());
+
+ kmer6.setByRead(6, array, 1);
+ String text7 = "GCTGAC";
+ merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
+ Assert.assertEquals(text5 + text7, merged.toString());
+
+ kmer6.setByRead(4, array, 1);
+ String text8 = "GCTG";
+ merged = kmerFactory.mergeTwoKmer(kmer5, kmer6);
+ Assert.assertEquals(text5 + text8, merged.toString());
+
+ }
+
+ @Test
+ public void TestShift() {
+ VKmerBytesWritable kmer = new VKmerBytesWritable(kmerFactory.getKmerByRead(9, array, 0));
+ String text = "AGCTGACCG";
+ Assert.assertEquals(text, kmer.toString());
+
+ VKmerBytesWritable kmerForward = kmerFactory.shiftKmerWithNextCode(kmer, GeneCode.A);
+ Assert.assertEquals(text, kmer.toString());
+ Assert.assertEquals("GCTGACCGA", kmerForward.toString());
+ VKmerBytesWritable kmerBackward = kmerFactory.shiftKmerWithPreCode(kmer, GeneCode.C);
+ Assert.assertEquals(text, kmer.toString());
+ Assert.assertEquals("CAGCTGACC", kmerBackward.toString());
+
+ }
+
+ @Test
+ public void TestReverseKmer() {
+ KmerBytesWritable kmer = new KmerBytesWritable(7);
+ kmer.setByRead(array, 0);
+ Assert.assertEquals(kmer.toString(), "AGCTGAC");
+ KmerBytesWritable reversed = kmerFactory.reverse(kmer);
+ Assert.assertEquals(reversed.toString(), "CAGTCGA");
+ }
+}
diff --git a/genomix/genomix-data/src/test/resources/data/0/text.txt b/genomix/genomix-data/src/test/resources/data/0/text.txt
new file mode 100755
index 0000000..f63a141
--- /dev/null
+++ b/genomix/genomix-data/src/test/resources/data/0/text.txt
@@ -0,0 +1,4 @@
+@625E1AAXX100810:1:100:10000:10271/1
+AATAGAAG
++
+EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
diff --git a/genomix/genomix-data/src/test/resources/data/webmap/text.txt b/genomix/genomix-data/src/test/resources/data/webmap/text.txt
new file mode 100755
index 0000000..f63a141
--- /dev/null
+++ b/genomix/genomix-data/src/test/resources/data/webmap/text.txt
@@ -0,0 +1,4 @@
+@625E1AAXX100810:1:100:10000:10271/1
+AATAGAAG
++
+EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
diff --git a/genomix/genomix-data/src/test/resources/expected/result2 b/genomix/genomix-data/src/test/resources/expected/result2
new file mode 100755
index 0000000..5e76458
--- /dev/null
+++ b/genomix/genomix-data/src/test/resources/expected/result2
@@ -0,0 +1,4 @@
+AATAG |A 1
+AGAAG T| 1
+ATAGA A|A 1
+TAGAA A|G 1
diff --git a/genomix/genomix-data/src/test/resources/hadoop/conf/core-site.xml b/genomix/genomix-data/src/test/resources/hadoop/conf/core-site.xml
new file mode 100644
index 0000000..3e5bacb
--- /dev/null
+++ b/genomix/genomix-data/src/test/resources/hadoop/conf/core-site.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+ <property>
+ <name>fs.default.name</name>
+ <value>hdfs://127.0.0.1:31888</value>
+ </property>
+ <property>
+ <name>hadoop.tmp.dir</name>
+ <value>/tmp/hadoop</value>
+ </property>
+
+
+</configuration>
diff --git a/genomix/genomix-data/src/test/resources/hadoop/conf/hdfs-site.xml b/genomix/genomix-data/src/test/resources/hadoop/conf/hdfs-site.xml
new file mode 100644
index 0000000..b1b1902
--- /dev/null
+++ b/genomix/genomix-data/src/test/resources/hadoop/conf/hdfs-site.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+ <property>
+ <name>dfs.replication</name>
+ <value>1</value>
+ </property>
+
+ <property>
+ <name>dfs.block.size</name>
+ <value>65536</value>
+ </property>
+
+</configuration>
diff --git a/genomix/genomix-data/src/test/resources/hadoop/conf/log4j.properties b/genomix/genomix-data/src/test/resources/hadoop/conf/log4j.properties
new file mode 100755
index 0000000..d5e6004
--- /dev/null
+++ b/genomix/genomix-data/src/test/resources/hadoop/conf/log4j.properties
@@ -0,0 +1,94 @@
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=FATAL,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+# Define the root logger to the system property "hadoop.root.logger".
+log4j.rootLogger=${hadoop.root.logger}, EventCounter
+
+# Logging Threshold
+log4j.threshhold=FATAL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+
+#Default values
+hadoop.tasklog.taskid=null
+hadoop.tasklog.noKeepSplits=4
+hadoop.tasklog.totalLogFileSize=100
+hadoop.tasklog.purgeLogSplits=true
+hadoop.tasklog.logsRetainHours=12
+
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
+log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# Rolling File Appender
+#
+
+#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
+#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Logfile size and 30-day backups
+#log4j.appender.RFA.MaxFileSize=1MB
+#log4j.appender.RFA.MaxBackupIndex=30
+
+#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+#
+# FSNamesystem Audit logging
+# All audit events are logged at INFO level
+#
+log4j.logger.org.apache.hadoop.fs.FSNamesystem.audit=WARN
+
+# Custom Logging levels
+
+#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
+#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
+#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG
+
+# Jets3t library
+log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
diff --git a/genomix/genomix-data/src/test/resources/hadoop/conf/mapred-site.xml b/genomix/genomix-data/src/test/resources/hadoop/conf/mapred-site.xml
new file mode 100644
index 0000000..525e7d5
--- /dev/null
+++ b/genomix/genomix-data/src/test/resources/hadoop/conf/mapred-site.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+ <property>
+ <name>mapred.job.tracker</name>
+ <value>localhost:29007</value>
+ </property>
+ <property>
+ <name>mapred.tasktracker.map.tasks.maximum</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>mapred.tasktracker.reduce.tasks.maximum</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>mapred.max.split.size</name>
+ <value>2048</value>
+ </property>
+
+</configuration>
diff --git a/genomix/genomix-hadoop/data/.DS_Store b/genomix/genomix-hadoop/data/.DS_Store
new file mode 100644
index 0000000..07f4ca0
--- /dev/null
+++ b/genomix/genomix-hadoop/data/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/data/webmap/.DS_Store b/genomix/genomix-hadoop/data/webmap/.DS_Store
new file mode 100644
index 0000000..17172c4
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/data/webmap/BridgePath b/genomix/genomix-hadoop/data/webmap/BridgePath
new file mode 100644
index 0000000..0717611
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/BridgePath
@@ -0,0 +1,2 @@
+TTTCCACTCCGTG
+TTTCCACCCCGTG
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/data/webmap/CyclePath b/genomix/genomix-hadoop/data/webmap/CyclePath
new file mode 100644
index 0000000..04080f4
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/CyclePath
@@ -0,0 +1 @@
+GCAACTTCATCAACT
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/data/webmap/SimplePath b/genomix/genomix-hadoop/data/webmap/SimplePath
new file mode 100644
index 0000000..80c03af
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/SimplePath
@@ -0,0 +1,3 @@
+ATATCGCATC
+AAGACAGCAC
+GCGGCAAGAA
diff --git a/genomix/genomix-hadoop/data/webmap/SinglePath b/genomix/genomix-hadoop/data/webmap/SinglePath
new file mode 100644
index 0000000..56ef5f8
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/SinglePath
@@ -0,0 +1 @@
+AGACAACAGT
diff --git a/genomix/genomix-hadoop/data/webmap/Test.txt b/genomix/genomix-hadoop/data/webmap/Test.txt
new file mode 100755
index 0000000..6d02b25
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/Test.txt
@@ -0,0 +1,10 @@
+@625E1AAXX100810:1:100:10000:10271/1
+AGCATCGCA
++
+EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
+@625E1AAXX100810:1:100:10000:10271/1
+TGCATCGCT
++
+EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
+
+
diff --git a/genomix/genomix-hadoop/data/webmap/TreePath b/genomix/genomix-hadoop/data/webmap/TreePath
new file mode 100644
index 0000000..f3c13ce
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/TreePath
@@ -0,0 +1,3 @@
+GGCCTGGCTATCCC
+GGCCTCAGTAACTAAAC
+GGCCTCAGTACGCCCGG
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/data/webmap/simplePath~ b/genomix/genomix-hadoop/data/webmap/simplePath~
new file mode 100644
index 0000000..6615392
--- /dev/null
+++ b/genomix/genomix-hadoop/data/webmap/simplePath~
@@ -0,0 +1 @@
+AGCATGCTATAT
diff --git a/genomix/genomix-hadoop/expected/result1 b/genomix/genomix-hadoop/expected/result1
new file mode 100644
index 0000000..ba52008
--- /dev/null
+++ b/genomix/genomix-hadoop/expected/result1
@@ -0,0 +1,8 @@
+GCA ACT|T 3
+AGC |A 1
+CGC T|AT 2
+TGC |A 1
+ATC C|G 2
+TCG A|C 2
+CAT G|C 2
+GCT C| 1
diff --git a/genomix/genomix-hadoop/expected/result2 b/genomix/genomix-hadoop/expected/result2
new file mode 100755
index 0000000..db55a38
--- /dev/null
+++ b/genomix/genomix-hadoop/expected/result2
@@ -0,0 +1,8 @@
+GCA -72
+AGC 1
+CGC -119
+TGC 1
+ATC 36
+TCG 18
+CAT 66
+GCT 32
diff --git a/genomix/genomix-hadoop/expected/result3 b/genomix/genomix-hadoop/expected/result3
new file mode 100644
index 0000000..5f9dd78
--- /dev/null
+++ b/genomix/genomix-hadoop/expected/result3
@@ -0,0 +1 @@
+02 71 66 1
diff --git a/genomix/genomix-hadoop/pom.xml b/genomix/genomix-hadoop/pom.xml
new file mode 100755
index 0000000..fe83c3c
--- /dev/null
+++ b/genomix/genomix-hadoop/pom.xml
@@ -0,0 +1,160 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <artifactId>genomix-hadoop</artifactId>
+ <name>genomix-hadoop</name>
+
+ <parent>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>genomix</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ </parent>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.0.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ <fork>true</fork>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ </configuration>
+ <executions>
+ <execution>
+ <id>make-my-jar-with-dependencies</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>appassembler-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <configuration>
+ <programs>
+ <program>
+ <mainClass>edu.uci.ics.maxclique.Driver</mainClass>
+ <name>maxclique</name>
+ </program>
+ </programs>
+ <repositoryLayout>flat</repositoryLayout>
+ <repositoryName>lib</repositoryName>
+ </configuration>
+ <phase>package</phase>
+ <goals>
+ <goal>assemble</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.7.2</version>
+ <configuration>
+ <forkMode>pertest</forkMode>
+ <argLine>-enableassertions -Xmx512m -XX:MaxPermSize=300m
+ -Dfile.encoding=UTF-8
+ -Djava.util.logging.config.file=src/test/resources/logging.properties</argLine>
+ <includes>
+ <include>**/*TestSuite.java</include>
+ <include>**/*Test.java</include>
+ </includes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-clean-plugin</artifactId>
+ <configuration>
+ <filesets>
+ <fileset>
+ <directory>.</directory>
+ <includes>
+ <include>teststore*</include>
+ <include>edu*</include>
+ <include>build*</include>
+ <include>log*</include>
+ <include>ClusterController*</include>
+ </includes>
+ </fileset>
+ </filesets>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-core</artifactId>
+ <version>0.20.2</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-test</artifactId>
+ <version>0.20.2</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.kenai.nbpwr</groupId>
+ <artifactId>org-apache-commons-io</artifactId>
+ <version>1.3.1-201002241208</version>
+ <type>nbm</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jcl</artifactId>
+ <version>1.6.3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.6.3</version>
+ </dependency>
+ <dependency>
+ <groupId>args4j</groupId>
+ <artifactId>args4j</artifactId>
+ <version>2.0.16</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>genomix-data</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/genomix/genomix-hadoop/src/.DS_Store b/genomix/genomix-hadoop/src/.DS_Store
new file mode 100644
index 0000000..e0bf627
--- /dev/null
+++ b/genomix/genomix-hadoop/src/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/.DS_Store b/genomix/genomix-hadoop/src/main/.DS_Store
new file mode 100644
index 0000000..325c6de
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/java/.DS_Store b/genomix/genomix-hadoop/src/main/java/.DS_Store
new file mode 100644
index 0000000..dd6c872
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/java/edu/.DS_Store b/genomix/genomix-hadoop/src/main/java/edu/.DS_Store
new file mode 100644
index 0000000..5e0c641
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/.DS_Store b/genomix/genomix-hadoop/src/main/java/edu/uci/.DS_Store
new file mode 100644
index 0000000..4f27e83
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/.DS_Store b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/.DS_Store
new file mode 100644
index 0000000..8f46380
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/.DS_Store b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/.DS_Store
new file mode 100644
index 0000000..f5eb144
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingDriver.java
new file mode 100644
index 0000000..132f6e0
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingDriver.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.gbresultschecking;
+
+import java.io.IOException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+@SuppressWarnings("deprecation")
+public class ResultsCheckingDriver {
+ private static class Options {
+ @Option(name = "-inputpath1", usage = "the input path", required = true)
+ public String inputPath1;
+
+ @Option(name = "-inputpath2", usage = "the input path", required = true)
+ public String inputPath2;
+
+ @Option(name = "-outputpath", usage = "the output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+ public int numReducers;
+
+ @Option(name = "-kmer-size", usage = "the size of kmer", required = true)
+ public int sizeKmer;
+
+ }
+
+ public void run(String inputPath1, String inputPath2, String outputPath, int numReducers, int sizeKmer,
+ String defaultConfPath) throws IOException {
+
+ JobConf conf = new JobConf(ResultsCheckingDriver.class);
+
+ conf.setInt("sizeKmer", sizeKmer);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+
+ conf.setJobName("Results Checking");
+ conf.setMapperClass(ResultsCheckingMapper.class);
+ conf.setReducerClass(ResultsCheckingReducer.class);
+
+ conf.setMapOutputKeyClass(Text.class);
+ conf.setMapOutputValueClass(Text.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+ conf.setOutputFormat(TextOutputFormat.class);
+
+ conf.setOutputKeyClass(Text.class);
+ conf.setOutputValueClass(Text.class);
+
+ Path[] inputList = new Path[2];
+ inputList[0] = new Path(inputPath1);
+ inputList[1] = new Path(inputPath2);
+
+ FileInputFormat.setInputPaths(conf, inputList);
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+
+ FileSystem dfs = FileSystem.get(conf);
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+ ResultsCheckingDriver driver = new ResultsCheckingDriver();
+ driver.run(options.inputPath1, options.inputPath2, options.outputPath, options.numReducers, options.sizeKmer,
+ null);
+ }
+
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingMapper.java
new file mode 100644
index 0000000..dca3808
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingMapper.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.gbresultschecking;
+
+import java.io.IOException;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+@SuppressWarnings({ "unused", "deprecation" })
+public class ResultsCheckingMapper extends MapReduceBase implements Mapper<KmerBytesWritable, KmerCountValue, Text, Text> {
+ KmerBytesWritable valWriter;
+ private final static IntWritable one = new IntWritable(1);
+ public static Text textkey = new Text();
+ public static Text textvalue = new Text();
+ public static String INPUT_PATH;
+ public static int KMER_SIZE;
+
+ public void configure(JobConf job) {
+ KMER_SIZE = job.getInt("sizeKmer", 0);
+ valWriter = new KmerBytesWritable(KMER_SIZE);
+ }
+
+ @Override
+ public void map(KmerBytesWritable key, KmerCountValue value, OutputCollector<Text, Text> output, Reporter reporter)
+ throws IOException {
+
+ FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
+ String filename = fileSplit.getPath().getName();
+ textkey.set(key.toString() + "\t" + value.toString());
+ textvalue.set(filename);
+ output.collect(textkey, textvalue);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingReducer.java
new file mode 100644
index 0000000..6f02136
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/gbresultschecking/ResultsCheckingReducer.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.gbresultschecking;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+@SuppressWarnings("deprecation")
+public class ResultsCheckingReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
+
+ public static Text textkey = new Text();
+ public static Text textvalue = new Text();
+
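+ // A kmer collected here carries one Text value per input file that
+ // contains it; emitting only single-valued keys reports the records
+ // that differ between the two result sets.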
+ @Override
+ public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
+ throws IOException {
+ textkey.set(key);
+ textvalue.set(values.next());
+ if (!values.hasNext()) {
+ output.collect(textkey, textvalue);
+ }
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
new file mode 100755
index 0000000..09a1bf1
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixCombiner.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.graphbuilding;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+/**
+ * This class implement the combiner operator of Mapreduce model
+ */
+@SuppressWarnings("deprecation")
+public class GenomixCombiner extends MapReduceBase implements
+ Reducer<KmerBytesWritable, KmerCountValue, KmerBytesWritable, KmerCountValue> {
+ private KmerCountValue vaWriter = new KmerCountValue();
+
+ @Override
+ public void reduce(KmerBytesWritable key, Iterator<KmerCountValue> values,
+ OutputCollector<KmerBytesWritable, KmerCountValue> output, Reporter reporter) throws IOException {
+ byte groupByAdjList = 0;
+ int count = 0;
+ byte bytCount = 0;
+ while (values.hasNext()) {
+ //Merge the adjacency bitmaps of all values for this kmer
+ KmerCountValue geneValue = values.next();
+ groupByAdjList = (byte) (groupByAdjList | geneValue.getAdjBitMap());
+ count = count + (int) geneValue.getCount();
+ }
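+ // Saturate at 127 so the count fits in a signed byte.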
+ if (count >= 127)
+ bytCount = (byte) 127;
+ else
+ bytCount = (byte) count;
+ vaWriter.set(groupByAdjList, bytCount);
+ output.collect(key, vaWriter);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java
new file mode 100755
index 0000000..e2f0f85
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixDriver.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.graphbuilding;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextInputFormat;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+/**
+ * This class implements the driver that starts the MapReduce job for graph building.
+ */
+@SuppressWarnings("deprecation")
+public class GenomixDriver {
+ private static class Options {
+ @Option(name = "-inputpath", usage = "the input path", required = true)
+ public String inputPath;
+
+ @Option(name = "-outputpath", usage = "the output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+ public int numReducers;
+
+ @Option(name = "-kmer-size", usage = "the size of kmer", required = true)
+ public int sizeKmer;
+ }
+
+ public void run(String inputPath, String outputPath, int numReducers, int sizeKmer, String defaultConfPath)
+ throws IOException {
+
+ JobConf conf = new JobConf(GenomixDriver.class);
+ conf.setInt("sizeKmer", sizeKmer);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+
+ conf.setJobName("Genomix Graph Building");
+ conf.setMapperClass(GenomixMapper.class);
+ conf.setReducerClass(GenomixReducer.class);
+ conf.setCombinerClass(GenomixCombiner.class);
+
+ conf.setMapOutputKeyClass(KmerBytesWritable.class);
+ conf.setMapOutputValueClass(KmerCountValue.class);
+
+ conf.setInputFormat(TextInputFormat.class);
+ conf.setOutputFormat(SequenceFileOutputFormat.class);
+ conf.setOutputKeyClass(KmerBytesWritable.class);
+ conf.setOutputValueClass(KmerCountValue.class);
+ FileInputFormat.setInputPaths(conf, new Path(inputPath));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+
+ FileSystem dfs = FileSystem.get(conf);
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+ GenomixDriver driver = new GenomixDriver();
+ driver.run(options.inputPath, options.outputPath, options.numReducers, options.sizeKmer, null);
+ }
+
+}
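
For reference, a hypothetical launch of the driver from another Java entry point; the flags mirror the args4j @Option declarations above, while the paths, reducer count, and k-mer size are placeholders rather than values from this commit:

```java
import edu.uci.ics.graphbuilding.GenomixDriver;

public class RunGraphBuilding {
    public static void main(String[] args) throws Exception {
        // placeholder paths and parameters; adjust to the actual cluster layout
        GenomixDriver.main(new String[] {
                "-inputpath", "/data/reads",
                "-outputpath", "/data/graph",
                "-num-reducers", "4",
                "-kmer-size", "21" });
    }
}
```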
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
new file mode 100755
index 0000000..f3f4584
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixMapper.java
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.graphbuilding;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+/**
+ * This class implements the mapper operator of the MapReduce model.
+ */
+@SuppressWarnings("deprecation")
+public class GenomixMapper extends MapReduceBase implements
+ Mapper<LongWritable, Text, KmerBytesWritable, KmerCountValue> {
+
+    // note: this helper class is not referenced anywhere in this file
+    public class CurrenByte {
+        public byte curByte;
+        public byte preMarker;
+    }
+
+    public static int KMER_SIZE;
+    public KmerCountValue outputAdjList;
+    public KmerBytesWritable outputKmer;
+    // compile the read-validation pattern once instead of on every map() call
+    private static final Pattern GENE_PATTERN = Pattern.compile("[AGCT]+");
+
+ @Override
+ public void configure(JobConf job) {
+ KMER_SIZE = Integer.parseInt(job.get("sizeKmer"));
+ outputAdjList = new KmerCountValue();
+ outputKmer = new KmerBytesWritable(KMER_SIZE);
+ }
+
+ /*succeed node
+ A 00000001 1
+ C 00000010 2
+ G 00000100 4
+ T 00001000 8
+ precursor node
+ A 00010000 16
+ C 00100000 32
+ G 01000000 64
+ T 10000000 128*/
+ @Override
+ public void map(LongWritable key, Text value, OutputCollector<KmerBytesWritable, KmerCountValue> output,
+ Reporter reporter) throws IOException {
+ /* A 00
+ C 01
+ G 10
+ T 11*/
+        String geneLine = value.toString(); // the raw read line
+        Matcher geneMatcher = GENE_PATTERN.matcher(geneLine);
+        // skip invalid reads, and reads shorter than k + 1 bases, which would
+        // overrun the array access below
+        if (geneMatcher.matches() && geneLine.length() > KMER_SIZE) {
+            /** first kmer */
+            byte count = 1;
+            byte[] array = geneLine.getBytes();
+            outputKmer.setByRead(array, 0);
+ byte pre = 0;
+ byte next = GeneCode.getAdjBit(array[KMER_SIZE]);
+ byte adj = GeneCode.mergePreNextAdj(pre, next);
+ outputAdjList.set(adj, count);
+ output.collect(outputKmer, outputAdjList);
+ /** middle kmer */
+ for (int i = KMER_SIZE; i < array.length - 1; i++) {
+ pre = GeneCode.getBitMapFromGeneCode(outputKmer.shiftKmerWithNextChar(array[i]));
+ next = GeneCode.getAdjBit(array[i + 1]);
+ adj = GeneCode.mergePreNextAdj(pre, next);
+ outputAdjList.set(adj, count);
+ output.collect(outputKmer, outputAdjList);
+ }
+ /** last kmer */
+ pre = GeneCode.getBitMapFromGeneCode(outputKmer.shiftKmerWithNextChar(array[array.length - 1]));
+ next = 0;
+ adj = GeneCode.mergePreNextAdj(pre, next);
+ outputAdjList.set(adj, count);
+ output.collect(outputKmer, outputAdjList);
+ }
+ }
+}
\ No newline at end of file
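
The mapper slides a window of size k over each read and records, for every k-mer, which bases can precede it (high nibble) and follow it (low nibble), per the bit table in the comment above. The sketch below re-implements just that bit layout in plain Java for a toy read and k = 3; it deliberately avoids the GeneCode/KmerBytesWritable classes, whose APIs live elsewhere in the repository:

```java
// Self-contained illustration of the adjacency encoding: successor bits in the
// low nibble (A=1, C=2, G=4, T=8), predecessor bits in the high nibble.
public class AdjacencySketch {
    static byte baseBit(char c) {
        switch (c) {
            case 'A': return 1;
            case 'C': return 2;
            case 'G': return 4;
            case 'T': return 8;
            default:  return 0;
        }
    }

    public static void main(String[] args) {
        String read = "AGCTG";    // toy read
        int k = 3;
        for (int i = 0; i + k <= read.length(); i++) {
            byte pre  = (i > 0) ? (byte) (baseBit(read.charAt(i - 1)) << 4) : 0;
            byte next = (i + k < read.length()) ? baseBit(read.charAt(i + k)) : 0;
            System.out.printf("%s -> 0x%02X%n", read.substring(i, i + k), (byte) (pre | next));
        }
        // prints: AGC -> 0x08, GCT -> 0x14, CTG -> 0x40
    }
}
```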
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
new file mode 100755
index 0000000..1fba709
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphbuilding/GenomixReducer.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.graphbuilding;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+/**
+ * This class implements the reducer operator of the MapReduce model.
+ */
+@SuppressWarnings("deprecation")
+public class GenomixReducer extends MapReduceBase implements
+ Reducer<KmerBytesWritable, KmerCountValue, KmerBytesWritable, KmerCountValue> {
+ KmerCountValue valWriter = new KmerCountValue();
+ static enum MyCounters { NUM_RECORDS };
+ @Override
+ public void reduce(KmerBytesWritable key, Iterator<KmerCountValue> values,
+ OutputCollector<KmerBytesWritable, KmerCountValue> output, Reporter reporter) throws IOException {
+ byte groupByAdjList = 0;
+ int count = 0;
+ byte bytCount = 0;
+ while (values.hasNext()) {
+            // merge the adjacency bitmaps of all records for this k-mer
+ KmerCountValue geneValue = values.next();
+ groupByAdjList = (byte) (groupByAdjList | geneValue.getAdjBitMap());
+ count = count + (int) geneValue.getCount();
+ }
+        // cap the aggregated count so it still fits in a signed byte
+        if (count >= 127)
+            bytCount = (byte) 127;
+        else
+            bytCount = (byte) count;
+ valWriter.set(groupByAdjList, bytCount);
+ output.collect(key, valWriter);
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterDriver.java
new file mode 100644
index 0000000..d54eca2
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterDriver.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.graphcountfilter;
+
+import java.io.IOException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
+@SuppressWarnings("deprecation")
+public class CountFilterDriver {
+ private static class Options {
+ @Option(name = "-inputpath", usage = "the input path", required = true)
+ public String inputPath;
+
+ @Option(name = "-outputpath", usage = "the output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+ public int numReducers;
+
+ @Option(name = "-count-threshold", usage = "the threshold of count", required = true)
+ public int countThreshold;
+ }
+
+ public void run(String inputPath, String outputPath, int numReducers, int countThreshold, String defaultConfPath)
+ throws IOException {
+
+ JobConf conf = new JobConf(CountFilterDriver.class);
+ conf.setInt("countThreshold", countThreshold);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+
+ conf.setJobName("Count Filter");
+ conf.setMapperClass(CountFilterMapper.class);
+ conf.setReducerClass(CountFilterReducer.class);
+ conf.setCombinerClass(CountFilterReducer.class);
+
+ conf.setMapOutputKeyClass(KmerBytesWritable.class);
+ conf.setMapOutputValueClass(ByteWritable.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+ conf.setOutputFormat(SequenceFileOutputFormat.class);
+
+ conf.setOutputKeyClass(KmerBytesWritable.class);
+ conf.setOutputValueClass(ByteWritable.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+
+ FileSystem dfs = FileSystem.get(conf);
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+ CountFilterDriver driver = new CountFilterDriver();
+ driver.run(options.inputPath, options.outputPath, options.numReducers, options.countThreshold, null);
+ }
+
+}
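
A hypothetical end-to-end chaining of the two drivers: build the k-mer graph, then keep only k-mers whose coverage reaches a threshold. All paths and numeric parameters are placeholders:

```java
import edu.uci.ics.graphbuilding.GenomixDriver;
import edu.uci.ics.graphcountfilter.CountFilterDriver;

public class RunPipeline {
    public static void main(String[] args) throws Exception {
        // graph building: reads -> (kmer, adjacency + count) sequence files
        new GenomixDriver().run("/data/reads", "/data/graph", 4, 21, null);
        // count filter: drop kmers seen fewer than 2 times
        new CountFilterDriver().run("/data/graph", "/data/filtered", 4, 2, null);
    }
}
```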
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterMapper.java
new file mode 100644
index 0000000..2a54217
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterMapper.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.graphcountfilter;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+
+@SuppressWarnings({ "deprecation" })
+public class CountFilterMapper extends MapReduceBase implements
+ Mapper<KmerBytesWritable, KmerCountValue, KmerBytesWritable, ByteWritable> {
+ private int THRESHOLD;
+ private ByteWritable adjByte = new ByteWritable();
+    @Override
+    public void configure(JobConf job) {
+        THRESHOLD = Integer.parseInt(job.get("countThreshold"));
+    }
+
+    @Override
+    public void map(KmerBytesWritable key, KmerCountValue value, OutputCollector<KmerBytesWritable, ByteWritable> output,
+            Reporter reporter) throws IOException {
+        // keep only k-mers whose coverage reaches the configured threshold
+        if (value.getCount() >= THRESHOLD) {
+            adjByte.set(value.getAdjBitMap());
+            output.collect(key, adjByte);
+        }
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterReducer.java
new file mode 100644
index 0000000..dd33451
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/graphcountfilter/CountFilterReducer.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.graphcountfilter;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
+@SuppressWarnings("deprecation")
+public class CountFilterReducer extends MapReduceBase implements
+ Reducer<KmerBytesWritable, ByteWritable, KmerBytesWritable, ByteWritable> {
+ @Override
+ public void reduce(KmerBytesWritable key, Iterator<ByteWritable> values,
+ OutputCollector<KmerBytesWritable, ByteWritable> output, Reporter reporter) throws IOException {
+        output.collect(key, values.next()); // duplicates carry the same value, so emit just the first
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/.DS_Store b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/.DS_Store
new file mode 100644
index 0000000..f9e3926
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathDriver.java
new file mode 100644
index 0000000..36c12ae
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathDriver.java
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmerging;
+
+import java.io.IOException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+@SuppressWarnings("deprecation")
+public class MergePathDriver {
+
+ private static class Options {
+ @Option(name = "-inputpath", usage = "the input path", required = true)
+ public String inputPath;
+
+ @Option(name = "-outputpath", usage = "the output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-mergeresultpath", usage = "the merging results path", required = true)
+ public String mergeResultPath;
+
+ @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+ public int numReducers;
+
+ @Option(name = "-kmer-size", usage = "the size of kmer", required = true)
+ public int sizeKmer;
+
+ @Option(name = "-merge-rounds", usage = "the while rounds of merging", required = true)
+ public int mergeRound;
+
+ }
+
+
+ public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer, int mergeRound, String defaultConfPath)
+ throws IOException{
+
+ JobConf conf = new JobConf(MergePathDriver.class);
+ conf.setInt("sizeKmer", sizeKmer);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+ conf.setJobName("Initial Path-Starting-Points Table");
+ conf.setMapperClass(SNodeInitialMapper.class);
+ conf.setReducerClass(SNodeInitialReducer.class);
+
+ conf.setMapOutputKeyClass(KmerBytesWritable.class);
+ conf.setMapOutputValueClass(MergePathValueWritable.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+ conf.setOutputFormat(SequenceFileOutputFormat.class);
+
+ conf.setOutputKeyClass(VKmerBytesWritable.class);
+ conf.setOutputValueClass(MergePathValueWritable.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath));
+ FileOutputFormat.setOutputPath(conf, new Path(inputPath + "-step1"));
+ conf.setNumReduceTasks(numReducers);
+ FileSystem dfs = FileSystem.get(conf);
+ dfs.delete(new Path(inputPath + "-step1"), true);
+ JobClient.runJob(conf);
+ int iMerge = 0;
+/*----------------------------------------------------------------------*/
+ for(iMerge = 0; iMerge < mergeRound; iMerge ++){
+
+ conf = new JobConf(MergePathDriver.class);
+ conf.setInt("sizeKmer", sizeKmer);
+ conf.setInt("iMerge", iMerge);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+ conf.setJobName("Path Merge");
+
+ conf.setMapperClass(MergePathMapper.class);
+ conf.setReducerClass(MergePathReducer.class);
+
+ conf.setMapOutputKeyClass(VKmerBytesWritable.class);
+ conf.setMapOutputValueClass(MergePathValueWritable.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+
+ String uncomplete = "uncomplete" + iMerge;
+ String complete = "complete" + iMerge;
+
+ MultipleOutputs.addNamedOutput(conf, uncomplete,
+ MergePathMultiSeqOutputFormat.class, VKmerBytesWritable.class,
+ MergePathValueWritable.class);
+
+ MultipleOutputs.addNamedOutput(conf, complete,
+ MergePathMultiSeqOutputFormat.class, VKmerBytesWritable.class,
+ MergePathValueWritable.class);
+
+ conf.setOutputKeyClass(VKmerBytesWritable.class);
+ conf.setOutputValueClass(MergePathValueWritable.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath + "-step1"));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ dfs.delete(new Path(inputPath + "-step1"), true);
+ dfs.rename(new Path(outputPath + "/" + uncomplete), new Path(inputPath + "-step1"));
+ dfs.rename(new Path(outputPath + "/" + complete), new Path(mergeResultPath + "/" + complete));
+ }
+ /*----------------------------------------*/
+        // one final merge round after the loop processes the last "uncomplete" set
+        conf = new JobConf(MergePathDriver.class);
+ conf.setInt("sizeKmer", sizeKmer);
+ conf.setInt("iMerge", iMerge);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+ conf.setJobName("Path Merge");
+
+ conf.setMapperClass(MergePathMapper.class);
+ conf.setReducerClass(MergePathReducer.class);
+
+ conf.setMapOutputKeyClass(VKmerBytesWritable.class);
+ conf.setMapOutputValueClass(MergePathValueWritable.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+
+ String uncomplete = "uncomplete" + iMerge;
+ String complete = "complete" + iMerge;
+
+ MultipleOutputs.addNamedOutput(conf, uncomplete,
+ MergePathMultiSeqOutputFormat.class, VKmerBytesWritable.class,
+ MergePathValueWritable.class);
+
+ MultipleOutputs.addNamedOutput(conf, complete,
+ MergePathMultiSeqOutputFormat.class, VKmerBytesWritable.class,
+ MergePathValueWritable.class);
+
+ conf.setOutputKeyClass(VKmerBytesWritable.class);
+ conf.setOutputValueClass(MergePathValueWritable.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath + "-step1"));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ dfs.delete(new Path(inputPath + "-step1"), true);
+ dfs.rename(new Path(outputPath + "/" + uncomplete), new Path(inputPath + "-step1"));
+ dfs.rename(new Path(outputPath + "/" + complete), new Path(mergeResultPath + "/" + complete));
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+ MergePathDriver driver = new MergePathDriver();
+ driver.run(options.inputPath, options.outputPath, options.mergeResultPath, options.numReducers, options.sizeKmer, options.mergeRound, null);
+ }
+}
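
Each round of the loop above reads `<input>-step1`, splits the job's output into the `uncompleteN` and `completeN` named outputs (judging by the names, paths still growing versus finished ones), then rotates directories so the uncomplete part becomes the next round's input. A schematic of those HDFS moves against plain strings, with placeholder paths:

```java
// Dry-run schematic of the per-round directory rotation; no FileSystem calls.
public class RotationSketch {
    public static void main(String[] args) {
        String input = "/data/graph", output = "/data/merge-out", result = "/data/merged";
        String step1 = input + "-step1";                 // per-round input
        for (int i = 0; i < 3; i++) {                    // 3 = sample merge rounds
            String uncomplete = "uncomplete" + i, complete = "complete" + i;
            System.out.println("run merge job: " + step1 + " -> " + output);
            System.out.println("delete " + step1);
            System.out.println("rename " + output + "/" + uncomplete + " -> " + step1);
            System.out.println("rename " + output + "/" + complete + " -> " + result + "/" + complete);
        }
    }
}
```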
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMapper.java
new file mode 100644
index 0000000..0c7dcc1
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMapper.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmerging;
+
+import java.io.IOException;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+@SuppressWarnings("deprecation")
+public class MergePathMapper extends MapReduceBase implements
+ Mapper<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+ private int KMER_SIZE;
+ private VKmerBytesWritableFactory outputKmerFactory;
+ private MergePathValueWritable outputValue;
+ private VKmerBytesWritable tmpKmer;
+ private VKmerBytesWritable outputKmer;
+
+    @Override
+    public void configure(JobConf job) {
+ KMER_SIZE = job.getInt("sizeKmer", 0);
+ outputKmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+ outputValue = new MergePathValueWritable();
+ tmpKmer = new VKmerBytesWritable(KMER_SIZE);
+ outputKmer = new VKmerBytesWritable(KMER_SIZE);
+ }
+
+ @Override
+ public void map(VKmerBytesWritable key, MergePathValueWritable value,
+ OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+
+ byte precursor = (byte) 0xF0;
+ byte succeed = (byte) 0x0F;
+ byte adjBitMap = value.getAdjBitMap();
+ byte bitFlag = value.getFlag();
+ precursor = (byte) (precursor & adjBitMap);
+ precursor = (byte) ((precursor & 0xff) >> 4);
+ succeed = (byte) (succeed & adjBitMap);
+ if (bitFlag == 1) {
+ byte succeedCode = GeneCode.getGeneCodeFromBitMap(succeed);
+ tmpKmer.set(outputKmerFactory.getLastKmerFromChain(KMER_SIZE, key));
+ outputKmer.set(outputKmerFactory.shiftKmerWithNextCode(tmpKmer, succeedCode));
+
+ tmpKmer.set(outputKmerFactory.getFirstKmerFromChain(key.getKmerLength() - (KMER_SIZE - 1), key));
+ outputValue.set(adjBitMap, bitFlag, tmpKmer);
+ output.collect(outputKmer, outputValue);
+        } else {
+            // not flagged for merging: pass the node through unchanged
+            outputKmer.set(key);
+            outputValue.set(value);
+            output.collect(outputKmer, outputValue);
+        }
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMultiSeqOutputFormat.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMultiSeqOutputFormat.java
new file mode 100644
index 0000000..bd4cd2a
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMultiSeqOutputFormat.java
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmerging;
+
+import java.io.File;
+import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+
+public class MergePathMultiSeqOutputFormat extends MultipleSequenceFileOutputFormat<VKmerBytesWritable, MergePathValueWritable>{
+ @Override
+ protected String generateLeafFileName(String name) {
+        // route each named output into its own subdirectory,
+        // e.g. "uncomplete0-r-00000" -> "uncomplete0/uncomplete0-r-00000"
+ String[] names = name.split("-");
+ return names[0] + File.separator + name;
+ }
+}
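
The override above groups each named output's part files into a subdirectory keyed by the prefix before the first dash. A plain-Java check of that mapping, using a part-file name of the shape Hadoop generates:

```java
import java.io.File;

public class LeafNameSketch {
    // same rule as generateLeafFileName above
    static String leafName(String name) {
        String[] names = name.split("-");
        return names[0] + File.separator + name;
    }

    public static void main(String[] args) {
        // "uncomplete0-r-00000" -> "uncomplete0/uncomplete0-r-00000" on Unix
        System.out.println(leafName("uncomplete0-r-00000"));
    }
}
```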
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMultiTextOutputFormat.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMultiTextOutputFormat.java
new file mode 100644
index 0000000..29d3b68
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathMultiTextOutputFormat.java
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmerging;
+
+import java.io.File;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
+
+public class MergePathMultiTextOutputFormat extends MultipleTextOutputFormat<Text, Text>{
+ @Override
+ protected String generateLeafFileName(String name) {
+        // route each named output into its own subdirectory, as in the sequence-file variant
+ String[] names = name.split("-");
+ return names[0] + File.separator + name;
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathReducer.java
new file mode 100644
index 0000000..52abc1c
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathReducer.java
@@ -0,0 +1,141 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmerging;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+@SuppressWarnings("deprecation")
+public class MergePathReducer extends MapReduceBase implements
+ Reducer<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+ private VKmerBytesWritableFactory kmerFactory;
+ private VKmerBytesWritable outputKmer;
+ private VKmerBytesWritable tmpKmer;
+ private int KMER_SIZE;
+ private MergePathValueWritable outputValue;
+ private MergePathValueWritable tmpOutputValue;
+ MultipleOutputs mos = null;
+ private int I_MERGE;
+
+ public void configure(JobConf job) {
+ mos = new MultipleOutputs(job);
+ I_MERGE = Integer.parseInt(job.get("iMerge"));
+ KMER_SIZE = job.getInt("sizeKmer", 0);
+ outputValue = new MergePathValueWritable();
+ tmpOutputValue = new MergePathValueWritable();
+ kmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+ outputKmer = new VKmerBytesWritable(KMER_SIZE);
+ tmpKmer = new VKmerBytesWritable(KMER_SIZE);
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public void reduce(VKmerBytesWritable key, Iterator<MergePathValueWritable> values,
+ OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ outputValue = values.next();
+        if (values.hasNext()) {
+ if (outputValue.getFlag() != 1) {
+ byte nextAdj = outputValue.getAdjBitMap();
+ byte succeed = (byte) 0x0F;
+ succeed = (byte) (succeed & nextAdj);
+
+ outputValue = values.next();
+ byte adjBitMap = outputValue.getAdjBitMap();
+ byte flag = outputValue.getFlag();
+ if (outputValue.getKmerLength() != 0)
+ outputKmer.set(kmerFactory.mergeTwoKmer(outputValue.getKmer(), key));
+ else
+ outputKmer.set(key);
+
+ adjBitMap = (byte) (adjBitMap & 0xF0);
+ adjBitMap = (byte) (adjBitMap | succeed);
+ outputValue.set(adjBitMap, flag, null);
+ mos.getCollector("uncomplete" + I_MERGE, reporter).collect(outputKmer, outputValue);
+ } else {
+ tmpOutputValue.set(outputValue);
+ byte tmpAdjMap = tmpOutputValue.getAdjBitMap();
+
+ outputValue = values.next();
+ if (outputValue.getFlag() != 1) {
+ if (tmpOutputValue.getKmerLength() != 0)
+ outputKmer.set(kmerFactory.mergeTwoKmer(tmpOutputValue.getKmer(), key));
+ else
+ outputKmer.set(key);
+
+ byte nextAdj = outputValue.getAdjBitMap();
+ byte succeed = (byte) 0x0F;
+ succeed = (byte) (succeed & nextAdj);
+ tmpAdjMap = (byte) (tmpAdjMap & 0xF0);
+ tmpAdjMap = (byte) (tmpAdjMap | succeed);
+ outputValue.set(tmpAdjMap, tmpOutputValue.getFlag(), null);
+ mos.getCollector("uncomplete" + I_MERGE, reporter).collect(outputKmer, outputValue);
+ } else {
+
+ tmpKmer.set(kmerFactory.getFirstKmerFromChain(KMER_SIZE - 1, key));
+ if (tmpOutputValue.getKmerLength() != 0)
+ outputKmer.set(kmerFactory.mergeTwoKmer(tmpOutputValue.getKmer(), tmpKmer));
+ else
+ outputKmer.set(tmpKmer);
+ tmpOutputValue.set(tmpAdjMap, tmpOutputValue.getFlag(), null);
+ mos.getCollector("complete" + I_MERGE, reporter).collect(outputKmer, tmpOutputValue);
+
+ tmpKmer.set(kmerFactory.getFirstKmerFromChain(KMER_SIZE - 1, key));
+ if (outputValue.getKmerLength() != 0)
+ outputKmer.set(kmerFactory.mergeTwoKmer(outputValue.getKmer(), tmpKmer));
+ else
+ outputKmer.set(tmpKmer);
+ outputValue.set(outputValue.getAdjBitMap(), outputValue.getFlag(), null);
+ mos.getCollector("complete" + I_MERGE, reporter).collect(outputKmer, outputValue);
+
+ while (values.hasNext()) {
+ outputValue = values.next();
+ tmpKmer.set(kmerFactory.getFirstKmerFromChain(KMER_SIZE - 1, key));
+ if (outputValue.getKmerLength() != 0)
+ outputKmer.set(kmerFactory.mergeTwoKmer(outputValue.getKmer(), tmpKmer));
+ else
+ outputKmer.set(tmpKmer);
+ outputValue.set(outputValue.getAdjBitMap(), outputValue.getFlag(), null);
+ mos.getCollector("complete" + I_MERGE, reporter).collect(outputKmer, outputValue);
+ }
+ }
+ }
+ } else {
+ if (outputValue.getFlag() != 0) {
+ tmpKmer.set(kmerFactory.getFirstKmerFromChain(KMER_SIZE - 1, key));
+ if (outputValue.getKmerLength() != 0)
+ outputKmer.set(kmerFactory.mergeTwoKmer(outputValue.getKmer(), tmpKmer));
+ else
+ outputKmer.set(tmpKmer);
+ outputValue.set(outputValue.getAdjBitMap(), outputValue.getFlag(), null);
+ mos.getCollector("complete" + I_MERGE, reporter).collect(outputKmer, outputValue);
+
+ } else
+ mos.getCollector("uncomplete" + I_MERGE, reporter).collect(key, outputValue);
+ }
+ }
+
+    public void close() throws IOException {
+        // flush and close the named multiple outputs
+        mos.close();
+    }
+}
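
Throughout the reducer, merging two chain nodes splices their adjacency bitmaps: the merged node keeps one node's predecessor bits (the high nibble) and the other node's successor bits (the low nibble), via `(adj & 0xF0) | (next & 0x0F)`. A small sketch of that arithmetic with made-up bitmaps:

```java
public class AdjSpliceSketch {
    public static void main(String[] args) {
        byte headAdj = (byte) 0x21;                    // hypothetical head-node bitmap
        byte tailAdj = (byte) 0x48;                    // hypothetical tail-node bitmap
        byte succeed = (byte) (tailAdj & 0x0F);        // tail's successor bits
        byte merged  = (byte) ((headAdj & 0xF0) | succeed);
        System.out.printf("merged adj = 0x%02X%n", merged);  // prints 0x28
    }
}
```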
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathValueWritable.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathValueWritable.java
new file mode 100644
index 0000000..9686c18
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/MergePathValueWritable.java
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmerging;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class MergePathValueWritable extends BinaryComparable implements WritableComparable<BinaryComparable> {
+
+ private static final byte[] EMPTY_BYTES = {};
+ private byte adjBitMap;
+ private byte flag;
+ private VKmerBytesWritable kmer;
+
+ public MergePathValueWritable() {
+ this((byte) 0, (byte) 0, 0, EMPTY_BYTES);
+ }
+
+ public MergePathValueWritable(byte adjBitMap, byte flag, int kmerSize, byte[] bytes) {
+ this.adjBitMap = adjBitMap;
+ this.flag = flag;
+ this.kmer = new VKmerBytesWritable(kmerSize, bytes);
+ kmer.set(bytes, 0, bytes.length);
+ }
+
+ public void set(MergePathValueWritable right) {
+ set(right.getAdjBitMap(), right.getFlag(), right.getKmer());
+ }
+
+ public void set(byte adjBitMap, byte flag, VKmerBytesWritable kmer) {
+ this.kmer.set(kmer);
+ this.adjBitMap = adjBitMap;
+ this.flag = flag;
+ }
+
+ @Override
+ public void readFields(DataInput arg0) throws IOException {
+        // deserialize in the same order write() emits: kmer, adjBitMap, flag
+ kmer.readFields(arg0);
+ adjBitMap = arg0.readByte();
+ flag = arg0.readByte();
+ }
+
+ @Override
+ public void write(DataOutput arg0) throws IOException {
+        // serialize as: kmer, then adjBitMap, then flag
+ kmer.write(arg0);
+ arg0.writeByte(adjBitMap);
+ arg0.writeByte(flag);
+ }
+
+ public VKmerBytesWritable getKmer() {
+ if (kmer.getLength() != 0) {
+ return kmer;
+ }
+ return null;
+ }
+
+ public byte getAdjBitMap() {
+ return this.adjBitMap;
+ }
+
+ public byte getFlag() {
+ return this.flag;
+ }
+
+    @Override
+    public String toString() {
+ return GeneCode.getSymbolFromBitMap(adjBitMap) + '\t' + String.valueOf(flag);
+ }
+
+ @Override
+ public byte[] getBytes() {
+        // backing bytes for BinaryComparable comparison; note this returns
+        // null, not an empty array, when no kmer is set
+ if (kmer.getLength() != 0) {
+ return kmer.getBytes();
+ } else
+ return null;
+
+ }
+
+ public int getKmerLength() {
+ return kmer.getKmerLength();
+ }
+
+ @Override
+ public int getLength() {
+ return kmer.getLength();
+ }
+}
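
Since the class is a Hadoop Writable, it can be checked with a plain-stream round trip. The sketch below uses only java.io plus the constructor defined above; the adjacency byte and flag are sample values, and the kmer is left empty, which is the same path the no-arg constructor exercises:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import edu.uci.ics.pathmerging.MergePathValueWritable;

public class RoundTripSketch {
    public static void main(String[] args) throws IOException {
        MergePathValueWritable value =
                new MergePathValueWritable((byte) 0x12, (byte) 1, 0, new byte[0]);

        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        value.write(new DataOutputStream(bos));       // kmer, adjBitMap, flag

        MergePathValueWritable copy = new MergePathValueWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy);                     // formatted by toString() above
    }
}
```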
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialMapper.java
new file mode 100644
index 0000000..1058fda
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialMapper.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmerging;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.GeneCode;
+
+@SuppressWarnings("deprecation")
+public class SNodeInitialMapper extends MapReduceBase implements
+ Mapper<KmerBytesWritable, ByteWritable, KmerBytesWritable, MergePathValueWritable> {
+
+ public int KMER_SIZE;
+ public KmerBytesWritable outputKmer;
+ public MergePathValueWritable outputAdjList;
+
+    @Override
+    public void configure(JobConf job) {
+ KMER_SIZE = Integer.parseInt(job.get("sizeKmer"));
+ outputKmer = new KmerBytesWritable(KMER_SIZE);
+ outputAdjList = new MergePathValueWritable();
+ }
+
+    /**
+     * Returns false exactly when the adjacency nibble has a single bit set
+     * (values 1, 2, 4, 8), i.e. when the degree on that side is exactly 1;
+     * equivalent to the original 16-case switch.
+     */
+    boolean measureDegree(byte adjacent) {
+        return Integer.bitCount(adjacent & 0xFF) != 1;
+    }
+
+ @Override
+ public void map(KmerBytesWritable key, ByteWritable value,
+ OutputCollector<KmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ byte precursor = (byte) 0xF0;
+ byte succeed = (byte) 0x0F;
+ byte adjBitMap = value.get();
+ byte bitFlag = (byte) 0;
+ precursor = (byte) (precursor & adjBitMap);
+ precursor = (byte) ((precursor & 0xff) >> 4);
+ succeed = (byte) (succeed & adjBitMap);
+ boolean inDegree = measureDegree(precursor);
+ boolean outDegree = measureDegree(succeed);
+        if (!inDegree && !outDegree) {
+            // both sides have degree 1: the node sits inside a simple chain
+            outputKmer.set(key);
+            bitFlag = (byte) 2;
+            outputAdjList.set(adjBitMap, bitFlag, null);
+            output.collect(outputKmer, outputAdjList);
+        } else {
+            for (int i = 0; i < 4; i++) {
+                byte temp = (byte) (0x01 << i);
+                temp = (byte) (succeed & temp);
+                if (temp != 0) {
+                    // emit the successor kmer, then shift back to restore the key
+                    byte succeedCode = GeneCode.getGeneCodeFromBitMap(temp);
+                    byte shiftedCode = key.shiftKmerWithNextCode(succeedCode);
+                    outputKmer.set(key);
+                    outputAdjList.set((byte) 0, bitFlag, null);
+                    output.collect(outputKmer, outputAdjList);
+                    key.shiftKmerWithPreCode(shiftedCode);
+                }
+ }
+ }
+ }
+}
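
A quick check of the degree rule used by `measureDegree`: an adjacency nibble with exactly one bit set (1, 2, 4, 8) means exactly one neighbor on that side, so the method reports false; every other value, including 0, reports true:

```java
public class DegreeSketch {
    public static void main(String[] args) {
        for (int adj = 0; adj < 16; adj++) {
            // false only for 1, 2, 4 and 8 (a single neighbor)
            System.out.println(adj + " -> " + (Integer.bitCount(adj) != 1));
        }
    }
}
```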
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialReducer.java
new file mode 100644
index 0000000..07cc32f
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmerging/SNodeInitialReducer.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmerging;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+@SuppressWarnings("deprecation")
+public class SNodeInitialReducer extends MapReduceBase implements
+ Reducer<KmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+ private VKmerBytesWritable outputKmer = new VKmerBytesWritable();
+ private MergePathValueWritable outputValue = new MergePathValueWritable();
+
+
+ @Override
+ public void reduce(KmerBytesWritable key, Iterator<MergePathValueWritable> values,
+ OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ outputKmer.set(key);
+ outputValue = values.next();
+        if (values.hasNext()) {
+            if (outputValue.getFlag() == 2) {
+                byte bitFlag = 1;
+                outputValue.set(outputValue.getAdjBitMap(), bitFlag, null);
+ output.collect(outputKmer, outputValue);
+ } else {
+ boolean flag = false;
+ while (values.hasNext()) {
+ outputValue = values.next();
+ if (outputValue.getFlag() == 2) {
+ flag = true;
+ break;
+ }
+ }
+            if (flag) {
+ byte bitFlag = 1;
+ outputValue.set(outputValue.getAdjBitMap(), bitFlag, null);
+ output.collect(outputKmer, outputValue);
+ }
+ }
+ } else {
+ if (outputValue.getFlag() == 2) {
+ byte bitFlag = 0;
+ outputValue.set(outputValue.getAdjBitMap(), bitFlag, null);
+ output.collect(outputKmer, outputValue);
+ }
+ }
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/.DS_Store b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/.DS_Store
new file mode 100644
index 0000000..7c4ae29
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/ENodeInitialReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/ENodeInitialReducer.java
new file mode 100644
index 0000000..1f9bc82
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/ENodeInitialReducer.java
@@ -0,0 +1,58 @@
+package edu.uci.ics.pathmergingh2;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+@SuppressWarnings("deprecation")
+public class ENodeInitialReducer extends MapReduceBase implements
+ Reducer<BytesWritable, MergePathValueWritable, BytesWritable, MergePathValueWritable> {
+ public BytesWritable outputKmer = new BytesWritable();
+ public MergePathValueWritable outputAdjList = new MergePathValueWritable();
+
+ @Override
+ public void reduce(BytesWritable key, Iterator<MergePathValueWritable> values,
+ OutputCollector<BytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ outputAdjList = values.next();
+ outputKmer.set(key);
+        if (values.hasNext()) {
+ byte bitFlag = outputAdjList.getFlag();
+ bitFlag = (byte) (bitFlag & 0xFE);
+ if (bitFlag == 2) {
+ bitFlag = (byte) (0x80 | outputAdjList.getFlag());
+ outputAdjList.set(outputAdjList.getAdjBitMap(), bitFlag, null);
+ output.collect(outputKmer, outputAdjList);
+
+ } else {
+ boolean flag = false;
+ while (values.hasNext()) {
+ outputAdjList = values.next();
+ if (outputAdjList.getFlag() == 2) {
+ flag = true;
+ break;
+ }
+ }
+                if (flag) {
+ bitFlag = (byte) (0x80 | outputAdjList.getFlag());
+ outputAdjList.set(outputAdjList.getAdjBitMap(), bitFlag, null);
+ output.collect(outputKmer, outputAdjList);
+ }
+ }
+ } else {
+ byte bitFlag = outputAdjList.getFlag();
+ bitFlag = (byte) (bitFlag & 0xFE);
+ if (bitFlag == 2) {
+ bitFlag = 0;
+ outputAdjList.set(outputAdjList.getAdjBitMap(), bitFlag, null);
+ output.collect(outputKmer, outputAdjList);
+ }
+ }
+ }
+}
+
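
The flag arithmetic above packs several markers into one byte: `& 0xFE` clears the low bit before the comparison against 2, and `0x80 |` sets the high bit on records that pass. The exact meaning of each bit belongs to the merge protocol elsewhere in this commit, so the sketch below only illustrates the masking itself, with a made-up input flag:

```java
public class FlagMaskSketch {
    public static void main(String[] args) {
        byte flag = 0x03;                        // hypothetical incoming flag
        byte cleared = (byte) (flag & 0xFE);     // drop the low bit -> 0x02
        if (cleared == 2) {
            byte tagged = (byte) (0x80 | flag);  // set the high bit -> 0x83
            System.out.printf("tagged flag = 0x%02X%n", tagged);
        }
    }
}
```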
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Driver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Driver.java
new file mode 100644
index 0000000..58849e2
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Driver.java
@@ -0,0 +1,166 @@
+package edu.uci.ics.pathmergingh2;
+
+import java.io.IOException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+
+@SuppressWarnings("deprecation")
+public class MergePathH2Driver {
+
+ private static class Options {
+ @Option(name = "-inputpath", usage = "the input path", required = true)
+ public String inputPath;
+
+ @Option(name = "-outputpath", usage = "the output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-mergeresultpath", usage = "the merging results path", required = true)
+ public String mergeResultPath;
+
+ @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+ public int numReducers;
+
+ @Option(name = "-kmer-size", usage = "the size of kmer", required = true)
+ public int sizeKmer;
+
+ @Option(name = "-merge-rounds", usage = "the while rounds of merging", required = true)
+ public int mergeRound;
+
+ }
+
+
+ public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer, int mergeRound, String defaultConfPath)
+ throws IOException{
+
+ JobConf conf = new JobConf(MergePathH2Driver.class);
+ conf.setInt("sizeKmer", sizeKmer);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+ conf.setJobName("Initial Path-Starting-Points Table");
+ conf.setMapperClass(SNodeInitialMapper.class);
+ conf.setReducerClass(SNodeInitialReducer.class);
+
+ conf.setMapOutputKeyClass(KmerBytesWritable.class);
+ conf.setMapOutputValueClass(MergePathValueWritable.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+ conf.setOutputFormat(SequenceFileOutputFormat.class);
+
+ conf.setOutputKeyClass(VKmerBytesWritable.class);
+ conf.setOutputValueClass(MergePathValueWritable.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath));
+ FileOutputFormat.setOutputPath(conf, new Path(inputPath + "-step1"));
+ conf.setNumReduceTasks(numReducers);
+ FileSystem dfs = FileSystem.get(conf);
+ dfs.delete(new Path(inputPath + "-step1"), true);
+ JobClient.runJob(conf);
+ int iMerge = 0;
+/*----------------------------------------------------------------------*/
+ for(iMerge = 0; iMerge < mergeRound; iMerge ++){
+ conf = new JobConf(MergePathH2Driver.class);
+ conf.setInt("sizeKmer", sizeKmer);
+ conf.setInt("iMerge", iMerge);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+ conf.setJobName("Path Merge");
+
+ conf.setMapperClass(MergePathH2Mapper.class);
+ conf.setReducerClass(MergePathH2Reducer.class);
+
+ conf.setMapOutputKeyClass(VKmerBytesWritable.class);
+ conf.setMapOutputValueClass(MergePathValueWritable.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+
+ String uncomplete = "uncomplete" + iMerge;
+ String complete = "complete" + iMerge;
+
+ MultipleOutputs.addNamedOutput(conf, uncomplete,
+ MergePathMultiSeqOutputFormat.class, VKmerBytesWritable.class,
+ MergePathValueWritable.class);
+
+ MultipleOutputs.addNamedOutput(conf, complete,
+ MergePathMultiTextOutputFormat.class, VKmerBytesWritable.class,
+ MergePathValueWritable.class);
+
+ conf.setOutputKeyClass(VKmerBytesWritable.class);
+ conf.setOutputValueClass(MergePathValueWritable.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath + "-step1"));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ dfs.delete(new Path(inputPath + "-step1"), true);
+ dfs.rename(new Path(outputPath + "/" + uncomplete), new Path(inputPath + "-step1"));
+ dfs.rename(new Path(outputPath + "/" + complete), new Path(mergeResultPath + "/" + complete));
+ }
+        // one final merge round after the loop processes the last "uncomplete" set
+        conf = new JobConf(MergePathH2Driver.class);
+ conf.setInt("sizeKmer", sizeKmer);
+ conf.setInt("iMerge", iMerge);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+ conf.setJobName("Path Merge");
+
+ conf.setMapperClass(MergePathH2Mapper.class);
+ conf.setReducerClass(MergePathH2Reducer.class);
+
+ conf.setMapOutputKeyClass(VKmerBytesWritable.class);
+ conf.setMapOutputValueClass(MergePathValueWritable.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+
+ String uncomplete = "uncomplete" + iMerge;
+ String complete = "complete" + iMerge;
+
+ MultipleOutputs.addNamedOutput(conf, uncomplete,
+ MergePathMultiTextOutputFormat.class, VKmerBytesWritable.class,
+ MergePathValueWritable.class);
+
+ MultipleOutputs.addNamedOutput(conf, complete,
+ MergePathMultiTextOutputFormat.class, VKmerBytesWritable.class,
+ MergePathValueWritable.class);
+
+ conf.setOutputKeyClass(VKmerBytesWritable.class);
+ conf.setOutputValueClass(MergePathValueWritable.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath + "-step1"));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ dfs.delete(new Path(inputPath + "-step1"), true);
+ dfs.rename(new Path(outputPath + "/" + uncomplete), new Path(inputPath + "-step1"));
+ dfs.rename(new Path(outputPath + "/" + complete), new Path(mergeResultPath + "/" + complete));
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+ MergePathH2Driver driver = new MergePathH2Driver();
+ driver.run(options.inputPath, options.outputPath, options.mergeResultPath, options.numReducers, options.sizeKmer, options.mergeRound, null);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Mapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Mapper.java
new file mode 100644
index 0000000..6ea9dd3
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Mapper.java
@@ -0,0 +1,87 @@
+package edu.uci.ics.pathmergingh2;
+
+import java.io.IOException;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+@SuppressWarnings("deprecation")
+public class MergePathH2Mapper extends MapReduceBase implements
+ Mapper<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+
+ private int KMER_SIZE;
+ private VKmerBytesWritableFactory outputKmerFactory;
+ private MergePathValueWritable outputValue;
+ private VKmerBytesWritable tmpKmer;
+ private VKmerBytesWritable outputKmer;
+
+    @Override
+    public void configure(JobConf job) {
+ KMER_SIZE = job.getInt("sizeKmer", 0);
+ outputKmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+ outputValue = new MergePathValueWritable();
+ tmpKmer = new VKmerBytesWritable(KMER_SIZE);
+ outputKmer = new VKmerBytesWritable(KMER_SIZE);
+ }
+
+ @Override
+ public void map(VKmerBytesWritable key, MergePathValueWritable value,
+ OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ byte precursor = (byte) 0xF0;
+ byte succeed = (byte) 0x0F;
+ byte adjBitMap = value.getAdjBitMap();
+ byte bitFlag = value.getFlag();
+ precursor = (byte) (precursor & adjBitMap);
+ precursor = (byte) ((precursor & 0xff) >> 4);
+ succeed = (byte) (succeed & adjBitMap);
+ byte bitStartEnd = (byte) (0x81 & bitFlag);
+
+ switch (bitStartEnd) {
+ case (byte) 0x01:
+ byte succeedCode = GeneCode.getGeneCodeFromBitMap(succeed);
+ tmpKmer.set(outputKmerFactory.getLastKmerFromChain(KMER_SIZE, key));
+ outputKmer.set(outputKmerFactory.shiftKmerWithNextCode(tmpKmer, succeedCode));
+
+ tmpKmer.set(outputKmerFactory.getFirstKmerFromChain(key.getKmerLength() - (KMER_SIZE - 1), key));
+ bitFlag = (byte) (bitFlag | 0x08);
+ outputValue.set(adjBitMap, bitFlag, tmpKmer);
+ output.collect(outputKmer, outputValue);
+ break;
+ case (byte) 0x80:
+ tmpKmer.set(outputKmerFactory.getFirstKmerFromChain(KMER_SIZE, key));
+ outputKmer.set(tmpKmer);
+ tmpKmer.set(outputKmerFactory.getLastKmerFromChain(key.getKmerLength() - KMER_SIZE, key));
+ bitFlag = (byte) (bitFlag | 0x10);
+ outputValue.set(adjBitMap, bitFlag, tmpKmer);
+ output.collect(outputKmer, outputValue);
+ break;
+ case (byte) 0x00:
+ succeedCode = GeneCode.getGeneCodeFromBitMap(succeed);
+ tmpKmer.set(outputKmerFactory.getLastKmerFromChain(KMER_SIZE, key));
+ outputKmer.set(outputKmerFactory.shiftKmerWithNextCode(tmpKmer, succeedCode));
+
+ tmpKmer.set(outputKmerFactory.getFirstKmerFromChain(key.getKmerLength() - (KMER_SIZE - 1), key));
+ bitFlag = (byte) (bitFlag | 0x08);
+ outputValue.set(adjBitMap, bitFlag, tmpKmer);
+ output.collect(outputKmer, outputValue);
+
+ bitFlag = (byte) (bitFlag & 0xF7);
+ tmpKmer.set(outputKmerFactory.getFirstKmerFromChain(KMER_SIZE, key));
+ outputKmer.set(tmpKmer);
+ tmpKmer.set(outputKmerFactory.getLastKmerFromChain(key.getKmerLength() - KMER_SIZE, key));
+ bitFlag = (byte) (bitFlag | 0x10);
+ outputValue.set(adjBitMap, bitFlag, tmpKmer);
+ output.collect(outputKmer, outputValue);
+ break;
+ case (byte) 0x81:
+ outputKmer.set(key);
+ outputValue.set(adjBitMap, bitFlag, null);
+ output.collect(outputKmer, outputValue);
+ break;
+ }
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Reducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Reducer.java
new file mode 100644
index 0000000..ad8b3c2
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathH2Reducer.java
@@ -0,0 +1,119 @@
+package edu.uci.ics.pathmergingh2;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.lib.MultipleOutputs;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+@SuppressWarnings("deprecation")
+public class MergePathH2Reducer extends MapReduceBase implements
+ Reducer<VKmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+ private VKmerBytesWritableFactory kmerFactory;
+ private VKmerBytesWritable outputKmer;
+ private VKmerBytesWritable tmpKmer1;
+ private VKmerBytesWritable tmpKmer2;
+ private int KMER_SIZE;
+ private MergePathValueWritable outputValue;
+ private MergePathValueWritable tmpOutputValue;
+
+ MultipleOutputs mos = null;
+ private int I_MERGE;
+
+ public void configure(JobConf job) {
+ mos = new MultipleOutputs(job);
+ I_MERGE = Integer.parseInt(job.get("iMerge"));
+ KMER_SIZE = job.getInt("sizeKmer", 0);
+ outputValue = new MergePathValueWritable();
+ tmpOutputValue = new MergePathValueWritable();
+ kmerFactory = new VKmerBytesWritableFactory(KMER_SIZE);
+ outputKmer = new VKmerBytesWritable(KMER_SIZE);
+ tmpKmer1 = new VKmerBytesWritable(KMER_SIZE);
+ tmpKmer2 = new VKmerBytesWritable(KMER_SIZE);
+ }
+
+ @SuppressWarnings("unchecked")
+ public void reduce(VKmerBytesWritable key, Iterator<MergePathValueWritable> values,
+ OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ outputValue = values.next();
+ outputKmer.set(key);
+        if (values.hasNext()) {
+ byte bitFlag = outputValue.getFlag();
+ byte bitStartEnd = (byte) (0x81 & bitFlag);
+ byte bitPosiNegative = (byte) (0x18 & bitFlag);
+ byte succeed = (byte) 0x0F;
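+            // Bits 0x08 and 0x10 (set by the mapper) select which side of the
+            // key the stored k-mer fragment attaches to.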
+ switch (bitPosiNegative) {
+ case (byte) 0x08:
+ if (outputValue.getKmerLength() != 0)
+ tmpKmer1.set(kmerFactory.mergeTwoKmer(outputValue.getKmer(), key));
+ else
+ tmpKmer1.set(key);
+ byte adjBitMap = outputValue.getAdjBitMap();
+ outputValue = values.next();
+ bitStartEnd = (byte) (0x81 & outputValue.getFlag());
+ if (bitStartEnd == (byte) 0x80) {
+ if (outputValue.getKmerLength() != 0)
+ tmpKmer2.set(kmerFactory.mergeTwoKmer(key, outputValue.getKmer()));
+ else
+ tmpKmer2.set(key);
+ byte tmpFlag = (byte) 0x80;
+                        tmpOutputValue.set(outputValue.getAdjBitMap(), tmpFlag, null);
+ mos.getCollector("uncomplete" + I_MERGE, reporter).collect(tmpKmer2, tmpOutputValue);
+ }
+ if (outputValue.getKmerLength() != 0)
+ outputKmer.set(kmerFactory.mergeTwoKmer(tmpKmer1, outputValue.getKmer()));
+ else
+ outputKmer.set(tmpKmer1);
+ succeed = (byte) (succeed & outputValue.getAdjBitMap());
+ adjBitMap = (byte) (adjBitMap & 0xF0);
+ adjBitMap = (byte) (adjBitMap | succeed);
+ byte outputFlag = (byte) (0x81 & bitFlag);
+ outputFlag = (byte) (outputFlag | ((byte) 0x81 & outputValue.getFlag()));
+ outputValue.set(adjBitMap, outputFlag, null);
+ mos.getCollector("uncomplete" + I_MERGE, reporter).collect(outputKmer, outputValue);
+ break;
+ case (byte) 0x10:
+ if (outputValue.getKmerLength() != 0)
+ tmpKmer1.set(kmerFactory.mergeTwoKmer(key, outputValue.getKmer()));
+ else
+ tmpKmer1.set(key);
+ if (bitStartEnd == (byte) 0x80) {
+ byte tmpFlag = (byte) 0x80;
+ tmpOutputValue.set(outputValue.getAdjBitMap(), tmpFlag, null);
+ mos.getCollector("uncomplete" + I_MERGE, reporter).collect(tmpKmer1, tmpOutputValue);
+ }
+ succeed = (byte) (succeed & outputValue.getAdjBitMap());
+ outputValue = values.next();
+ if (outputValue.getKmerLength() != 0)
+ outputKmer.set(kmerFactory.mergeTwoKmer(outputValue.getKmer(), tmpKmer1));
+ else
+ outputKmer.set(tmpKmer1);
+ adjBitMap = outputValue.getAdjBitMap();
+ adjBitMap = (byte) (adjBitMap & 0xF0);
+ adjBitMap = (byte) (adjBitMap | succeed);
+ outputFlag = (byte) (0x81 & bitFlag);
+ outputFlag = (byte) (outputFlag | ((byte) 0x81 & outputValue.getFlag()));
+ outputValue.set(adjBitMap, outputFlag, null);
+ mos.getCollector("uncomplete" + I_MERGE, reporter).collect(outputKmer, outputValue);
+ break;
+ }
+ } else {
+ byte bitFlag = outputValue.getFlag();
+ byte bitStartEnd = (byte) (0x81 & bitFlag);
+ if (bitStartEnd == (byte) 0x81) {
+ outputKmer.set(key);
+ mos.getCollector("complete" + I_MERGE, reporter).collect(outputKmer, outputValue);
+ }
+ }
+ }
+
+    public void close() throws IOException {
+ mos.close();
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathMultiSeqOutputFormat.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathMultiSeqOutputFormat.java
new file mode 100644
index 0000000..5e6f008
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathMultiSeqOutputFormat.java
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmergingh2;
+
+import java.io.File;
+import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
+
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class MergePathMultiSeqOutputFormat extends MultipleSequenceFileOutputFormat<VKmerBytesWritable, MergePathValueWritable> {
+ @Override
+ protected String generateLeafFileName(String name) {
+        // Place each output file in a sub-directory named after its
+        // named-output prefix, e.g. "complete0-r-00000" -> "complete0/complete0-r-00000".
+ String[] names = name.split("-");
+ return names[0] + File.separator + name;
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathMultiTextOutputFormat.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathMultiTextOutputFormat.java
new file mode 100644
index 0000000..d6176e2
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathMultiTextOutputFormat.java
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmergingh2;
+
+import java.io.File;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
+
+public class MergePathMultiTextOutputFormat extends MultipleTextOutputFormat<Text, Text> {
+ @Override
+ protected String generateLeafFileName(String name) {
+        // Place each output file in a sub-directory named after its
+        // named-output prefix, e.g. "complete0-r-00000" -> "complete0/complete0-r-00000".
+ String[] names = name.split("-");
+ return names[0] + File.separator + name;
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathValueWritable.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathValueWritable.java
new file mode 100644
index 0000000..2f1869d
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/MergePathValueWritable.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.pathmergingh2;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class MergePathValueWritable extends BinaryComparable implements WritableComparable<BinaryComparable> {
+
+ private static final byte[] EMPTY_BYTES = {};
+ private byte adjBitMap;
+ private byte flag;
+ private VKmerBytesWritable kmer;
+
+ public MergePathValueWritable() {
+ this((byte) 0, (byte) 0, 0, EMPTY_BYTES);
+ }
+
+ public MergePathValueWritable(int k) {
+ this.adjBitMap = 0;
+ this.flag = 0;
+ this.kmer = new VKmerBytesWritable(k);
+ }
+
+ public MergePathValueWritable(byte adjBitMap, byte flag, int kmerSize, byte[] bytes) {
+ this.adjBitMap = adjBitMap;
+ this.flag = flag;
+ this.kmer = new VKmerBytesWritable(kmerSize, bytes);
+ kmer.set(bytes, 0, bytes.length);
+ }
+
+ public void set(MergePathValueWritable right) {
+ set(right.getAdjBitMap(), right.getFlag(), right.getKmer());
+ }
+
+ public void set(byte adjBitMap, byte flag, VKmerBytesWritable kmer) {
+ this.kmer.set(kmer);
+ this.adjBitMap = adjBitMap;
+ this.flag = flag;
+ }
+
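+    // Serialized form: the variable-length k-mer, followed by the adjacency
+    // byte and the flag byte.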
+ @Override
+ public void readFields(DataInput arg0) throws IOException {
+ kmer.readFields(arg0);
+ adjBitMap = arg0.readByte();
+ flag = arg0.readByte();
+ }
+
+ @Override
+ public void write(DataOutput arg0) throws IOException {
+ kmer.write(arg0);
+ arg0.writeByte(adjBitMap);
+ arg0.writeByte(flag);
+ }
+
+ public VKmerBytesWritable getKmer() {
+ if (kmer.getLength() != 0) {
+ return kmer;
+ }
+ return null;
+ }
+
+ public byte getAdjBitMap() {
+ return this.adjBitMap;
+ }
+
+ public byte getFlag() {
+ return this.flag;
+ }
+
+ public String toString() {
+ return GeneCode.getSymbolFromBitMap(adjBitMap) + '\t' + String.valueOf(flag);
+ }
+
+ @Override
+ public byte[] getBytes() {
+        if (kmer.getLength() != 0) {
+            return kmer.getBytes();
+        }
+        return null;
+ }
+
+ public int getKmerLength() {
+ return kmer.getKmerLength();
+ }
+
+ @Override
+ public int getLength() {
+ return kmer.getLength();
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/SNodeInitialMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/SNodeInitialMapper.java
new file mode 100644
index 0000000..4c05dac
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/SNodeInitialMapper.java
@@ -0,0 +1,141 @@
+package edu.uci.ics.pathmergingh2;
+
+import java.io.IOException;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
+@SuppressWarnings("deprecation")
+public class SNodeInitialMapper extends MapReduceBase implements
+ Mapper<KmerBytesWritable, ByteWritable, KmerBytesWritable, MergePathValueWritable> {
+
+ public int KMER_SIZE;
+ public KmerBytesWritable outputKmer;
+ public MergePathValueWritable outputAdjList;
+
+ public void configure(JobConf job) {
+ KMER_SIZE = Integer.parseInt(job.get("sizeKmer"));
+ outputKmer = new KmerBytesWritable(KMER_SIZE);
+ outputAdjList = new MergePathValueWritable();
+ }
+
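+    // True when the 4-bit adjacency nibble does not encode exactly one
+    // neighbor, i.e. the degree on that side is zero or more than one; only
+    // 1, 2, 4 and 8 have a single bit set.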
+ boolean measureDegree(byte adjacent) {
+        switch (adjacent) {
+            case 1:
+            case 2:
+            case 4:
+            case 8:
+                return false;
+            default:
+                return true;
+        }
+ }
+
+ @Override
+ public void map(KmerBytesWritable key, ByteWritable value,
+ OutputCollector<KmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
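+        // Split the adjacency byte: the high nibble holds the predecessors,
+        // the low nibble the successors.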
+ byte precursor = (byte) 0xF0;
+ byte succeed = (byte) 0x0F;
+ byte adjBitMap = value.get();
+ byte bitFlag = (byte) 0;
+ precursor = (byte) (precursor & adjBitMap);
+ precursor = (byte) ((precursor & 0xff) >> 4);
+ succeed = (byte) (succeed & adjBitMap);
+ boolean inDegree = measureDegree(precursor);
+ boolean outDegree = measureDegree(succeed);
+ if (key.toString().equals("CGC")) {
+ int a = 2;
+ int b = a;
+ }
+ if (key.toString().equals("TCG")) {
+ int a = 2;
+ int b = a;
+ }
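+        // A node with exactly one predecessor and one successor lies on a
+        // simple path: emit it with flag 0x02 so the reducer treats it as a
+        // merge candidate. Otherwise the node is a branch: each predecessor
+        // k-mer is notified with flag 0x80 and each successor with flag 0x01.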
+        if (!inDegree && !outDegree) {
+ outputKmer.set(key);
+ bitFlag = (byte) 2;
+ outputAdjList.set(adjBitMap, bitFlag, null);
+ output.collect(outputKmer, outputAdjList);
+ } else {
+ for (int i = 0; i < 4; i++) {
+ byte temp = (byte) 0x01;
+ byte shiftedCode = 0;
+ temp = (byte) (temp << i);
+ temp = (byte) (precursor & temp);
+ if (temp != 0) {
+ byte precurCode = GeneCode.getGeneCodeFromBitMap(temp);
+ shiftedCode = key.shiftKmerWithPreCode(precurCode);
+ outputKmer.set(key);
+ bitFlag = (byte) 0x80;
+ outputAdjList.set((byte) 0, bitFlag, null);
+ output.collect(outputKmer, outputAdjList);
+ key.shiftKmerWithNextCode(shiftedCode);
+ }
+ }
+ for (int i = 0; i < 4; i++) {
+ byte temp = (byte) 0x01;
+ byte shiftedCode = 0;
+ temp = (byte) (temp << i);
+ temp = (byte) (succeed & temp);
+ if (temp != 0) {
+ byte succeedCode = GeneCode.getGeneCodeFromBitMap(temp);
+ shiftedCode = key.shiftKmerWithNextCode(succeedCode);
+ outputKmer.set(key);
+ bitFlag = (byte) 0x01;
+ outputAdjList.set((byte) 0, bitFlag, null);
+ output.collect(outputKmer, outputAdjList);
+ key.shiftKmerWithPreCode(shiftedCode);
+ }
+ }
+ }
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/SNodeInitialReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/SNodeInitialReducer.java
new file mode 100644
index 0000000..7fd7a2e
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/pathmergingh2/SNodeInitialReducer.java
@@ -0,0 +1,80 @@
+package edu.uci.ics.pathmergingh2;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+@SuppressWarnings("deprecation")
+public class SNodeInitialReducer extends MapReduceBase implements
+ Reducer<KmerBytesWritable, MergePathValueWritable, VKmerBytesWritable, MergePathValueWritable> {
+ private VKmerBytesWritable outputKmer = new VKmerBytesWritable();
+ private MergePathValueWritable outputValue = new MergePathValueWritable();
+
+ @Override
+ public void reduce(KmerBytesWritable key, Iterator<MergePathValueWritable> values,
+ OutputCollector<VKmerBytesWritable, MergePathValueWritable> output, Reporter reporter) throws IOException {
+ outputKmer.set(key);
+ outputValue = values.next();
+ byte startFlag = 0x00;
+ byte endFlag = 0x00;
+ byte targetPointFlag = 0x00;
+ byte targetAdjList = 0x00;
+ byte outputFlag = 0x00;
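+        // Gather the markers sent by the mapper: 0x01 = path start, 0x80 =
+        // path end, 0x02 = a simple-path node carrying its adjacency map.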
+        if (values.hasNext()) {
+ switch (outputValue.getFlag()) {
+ case (byte) 0x01:
+ startFlag = (byte) 0x01;
+ break;
+ case (byte) 0x80:
+ endFlag = (byte) 0x80;
+ break;
+ case (byte) 0x02:
+ targetPointFlag = (byte) 0x02;
+ targetAdjList = outputValue.getAdjBitMap();
+ break;
+ }
+ while (values.hasNext()) {
+ outputValue = values.next();
+ switch (outputValue.getFlag()) {
+ case (byte) 0x01:
+ startFlag = (byte) 0x01;
+ break;
+ case (byte) 0x80:
+ endFlag = (byte) 0x80;
+ break;
+ case (byte) 0x02:
+ targetPointFlag = (byte) 0x02;
+ targetAdjList = outputValue.getAdjBitMap();
+ break;
+ }
+                if (startFlag != (byte) 0x00 && endFlag != (byte) 0x00 && targetPointFlag != (byte) 0x00)
+ break;
+ }
+            if (targetPointFlag == (byte) 0x02) {
+                if (startFlag == (byte) 0x01) {
+                    outputFlag = (byte) (outputFlag | startFlag);
+                }
+                if (endFlag == (byte) 0x80) {
+ outputFlag = (byte) (outputFlag | endFlag);
+ }
+ outputValue.set(targetAdjList, outputFlag, null);
+ output.collect(outputKmer, outputValue);
+ }
+ } else {
+ if (outputValue.getFlag() == 2) {
+ byte bitFlag = 0;
+ outputValue.set(outputValue.getAdjBitMap(), bitFlag, null);
+ output.collect(outputKmer, outputValue);
+ }
+ }
+ }
+}
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatDriver.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatDriver.java
new file mode 100644
index 0000000..efe1589
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatDriver.java
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.statistics;
+
+import java.io.IOException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+@SuppressWarnings("deprecation")
+public class GenomixStatDriver {
+ private static class Options {
+ @Option(name = "-inputpath", usage = "the input path", required = true)
+ public String inputPath;
+
+ @Option(name = "-outputpath", usage = "the output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-num-reducers", usage = "the number of reducers", required = true)
+ public int numReducers;
+
+ }
+
+ public void run(String inputPath, String outputPath, int numReducers, String defaultConfPath)
+ throws IOException {
+
+ JobConf conf = new JobConf(GenomixStatDriver.class);
+
+ if (defaultConfPath != null) {
+ conf.addResource(new Path(defaultConfPath));
+ }
+
+ conf.setJobName("Genomix Statistics");
+ conf.setMapperClass(GenomixStatMapper.class);
+ conf.setReducerClass(GenomixStatReducer.class);
+ conf.setCombinerClass(GenomixStatReducer.class);
+
+ conf.setMapOutputKeyClass(BytesWritable.class);
+ conf.setMapOutputValueClass(KmerCountValue.class);
+
+ conf.setInputFormat(SequenceFileInputFormat.class);
+ conf.setOutputFormat(SequenceFileOutputFormat.class);
+
+ conf.setOutputKeyClass(BytesWritable.class);
+ conf.setOutputValueClass(KmerCountValue.class);
+
+ FileInputFormat.setInputPaths(conf, new Path(inputPath));
+ FileOutputFormat.setOutputPath(conf, new Path(outputPath));
+ conf.setNumReduceTasks(numReducers);
+
+ FileSystem dfs = FileSystem.get(conf);
+ dfs.delete(new Path(outputPath), true);
+ JobClient.runJob(conf);
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+ GenomixStatDriver driver = new GenomixStatDriver();
+ driver.run(options.inputPath, options.outputPath, options.numReducers, null);
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatMapper.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatMapper.java
new file mode 100644
index 0000000..c5feefe
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatMapper.java
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.statistics;
+
+import java.io.IOException;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+@SuppressWarnings({ "unused", "deprecation" })
+public class GenomixStatMapper extends MapReduceBase implements
+ Mapper<BytesWritable, KmerCountValue, BytesWritable, KmerCountValue> {
+
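+    // True when the 4-bit adjacency nibble does not encode exactly one
+    // neighbor (only 1, 2, 4 and 8 have a single bit set).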
+ boolean measureDegree(byte adjacent) {
+        switch (adjacent) {
+            case 1:
+            case 2:
+            case 4:
+            case 8:
+                return false;
+            default:
+                return true;
+        }
+    }
+
+ @Override
+ public void map(BytesWritable key, KmerCountValue value, OutputCollector<BytesWritable, KmerCountValue> output,
+ Reporter reporter) throws IOException {
+ byte precursor = (byte) 0xF0;
+ byte succeed = (byte) 0x0F;
+ byte adj = value.getAdjBitMap();
+ precursor = (byte) (precursor & adj);
+ precursor = (byte) ((precursor & 0xff) >> 4);
+ succeed = (byte) (succeed & adj);
+ boolean inDegree = measureDegree(precursor);
+ boolean outDegree = measureDegree(succeed);
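+        // Keep only k-mers whose in-degree is not exactly one and whose
+        // out-degree is exactly one.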
+        if (inDegree && !outDegree) {
+ output.collect(key, value);
+ }
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatReducer.java b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatReducer.java
new file mode 100644
index 0000000..ea9a915
--- /dev/null
+++ b/genomix/genomix-hadoop/src/main/java/edu/uci/ics/statistics/GenomixStatReducer.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.statistics;
+
+import java.io.IOException;
+import java.util.Iterator;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+@SuppressWarnings("deprecation")
+public class GenomixStatReducer extends MapReduceBase implements
+ Reducer<BytesWritable, KmerCountValue, BytesWritable, KmerCountValue> {
+ static enum MyCounters { NUM_RECORDS };
+ KmerCountValue valWriter = new KmerCountValue();
+ @Override
+ public void reduce(BytesWritable key, Iterator<KmerCountValue> values,
+ OutputCollector<BytesWritable, KmerCountValue> output, Reporter reporter) throws IOException {
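+        // Count the call and pass the first value through unchanged.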
+ reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+ valWriter = values.next();
+ output.collect(key, valWriter);
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/src/test/.DS_Store b/genomix/genomix-hadoop/src/test/.DS_Store
new file mode 100644
index 0000000..bfe14e8
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/test/java/.DS_Store b/genomix/genomix-hadoop/src/test/java/.DS_Store
new file mode 100644
index 0000000..fb3684c
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/test/java/edu/.DS_Store b/genomix/genomix-hadoop/src/test/java/edu/.DS_Store
new file mode 100644
index 0000000..f50e64b
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/.DS_Store b/genomix/genomix-hadoop/src/test/java/edu/uci/.DS_Store
new file mode 100644
index 0000000..9aea623
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/.DS_Store b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/.DS_Store
new file mode 100644
index 0000000..64f18c4
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/.DS_Store
Binary files differ
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/gbresultschecking/ResultsCheckingTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/gbresultschecking/ResultsCheckingTest.java
new file mode 100644
index 0000000..72e9b45
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/gbresultschecking/ResultsCheckingTest.java
@@ -0,0 +1,76 @@
+package edu.uci.ics.gbresultschecking;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MiniMRCluster;
+import org.junit.Test;
+
+@SuppressWarnings("deprecation")
+public class ResultsCheckingTest {
+ private static final String ACTUAL_RESULT_DIR = "actual4";
+ private JobConf conf = new JobConf();
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
+ private static final String DATA_PATH1 = "ResultsCheckingData" + "/part-00000";
+ private static final String DATA_PATH2 = "ResultsCheckingData" + "/part-00001";
+ private static final String HDFS_PATH1 = "/webmap1";
+ private static final String HDFS_PATH2 = "/webmap2";
+ private static final String RESULT_PATH = "/result4";
+ private static final int COUNT_REDUCER = 4;
+ private static final int SIZE_KMER = 3;
+ private MiniDFSCluster dfsCluster;
+ private MiniMRCluster mrCluster;
+ private FileSystem dfs;
+
+ @Test
+ public void test() throws Exception {
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ startHadoop();
+ ResultsCheckingDriver tldriver = new ResultsCheckingDriver();
+ tldriver.run(HDFS_PATH1, HDFS_PATH2, RESULT_PATH, COUNT_REDUCER, SIZE_KMER, HADOOP_CONF_PATH);
+ dumpResult();
+ cleanupHadoop();
+
+    }
+
+ private void startHadoop() throws IOException {
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, 2, true, null);
+ dfs = dfsCluster.getFileSystem();
+ mrCluster = new MiniMRCluster(4, dfs.getUri().toString(), 2);
+
+ Path src = new Path(DATA_PATH1);
+ Path dest = new Path(HDFS_PATH1 + "/");
+ dfs.mkdirs(dest);
+ dfs.copyFromLocalFile(src, dest);
+ src = new Path(DATA_PATH2);
+ dest = new Path(HDFS_PATH2 + "/");
+ dfs.copyFromLocalFile(src, dest);
+
+ DataOutputStream confOutput = new DataOutputStream(new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
+
+ private void cleanupHadoop() throws IOException {
+ mrCluster.shutdown();
+ dfsCluster.shutdown();
+ }
+
+ private void dumpResult() throws IOException {
+ Path src = new Path(RESULT_PATH);
+ Path dest = new Path(ACTUAL_RESULT_DIR + "/");
+ dfs.copyToLocalFile(src, dest);
+ }
+}
+
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java
new file mode 100755
index 0000000..97b861c
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphbuilding/GraphBuildingTest.java
@@ -0,0 +1,144 @@
+package edu.uci.ics.graphbuilding;
+
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.BufferedWriter;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MiniMRCluster;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+import edu.uci.ics.utils.TestUtils;
+/**
+ * This class tests the correctness of the graph building program.
+ */
+@SuppressWarnings("deprecation")
+public class GraphBuildingTest {
+
+ private static final String ACTUAL_RESULT_DIR = "actual1";
+ private static final String COMPARE_DIR = "compare";
+ private JobConf conf = new JobConf();
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
+ private static final String DATA_PATH = "data/webmap/BridgePath";
+ private static final String HDFS_PATH = "/webmap";
+ private static final String RESULT_PATH = "/result1";
+ private static final String EXPECTED_PATH = "expected/result1";
+ private static final String TEST_SOURCE_DIR = COMPARE_DIR + RESULT_PATH + "/comparesource.txt";
+ private static final int COUNT_REDUCER = 4;
+ private static final int SIZE_KMER = 5;
+ private static final String GRAPHVIZ = "Graphviz/GenomixSource.txt";
+
+ private MiniDFSCluster dfsCluster;
+ private MiniMRCluster mrCluster;
+ private FileSystem dfs;
+
+ @SuppressWarnings("resource")
+ @Test
+ public void test() throws Exception {
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ startHadoop();
+
+ // run graph transformation tests
+ GenomixDriver tldriver = new GenomixDriver();
+ tldriver.run(HDFS_PATH, RESULT_PATH, COUNT_REDUCER, SIZE_KMER, HADOOP_CONF_PATH);
+
+ SequenceFile.Reader reader = null;
+ Path path = new Path(RESULT_PATH + "/part-00000");
+ reader = new SequenceFile.Reader(dfs, path, conf);
+// KmerBytesWritable key = (KmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ KmerBytesWritable key = new KmerBytesWritable(SIZE_KMER);
+ KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+ File filePathTo = new File(TEST_SOURCE_DIR);
+ BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+ File GraphViz = new File(GRAPHVIZ);
+ BufferedWriter bw2 = new BufferedWriter(new FileWriter(GraphViz));
+
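+        // For each successor edge, write the "from" k-mer and the shifted
+        // "to" k-mer into the Graphviz source file, restoring the key after
+        // every shift.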
+ while (reader.next(key, value)) {
+ byte succeed = (byte) 0x0F;
+ byte adjBitMap = value.getAdjBitMap();
+ succeed = (byte) (succeed & adjBitMap);
+ byte shiftedCode = 0;
+            for (int i = 0; i < 4; i++) {
+                byte temp = 0x01;
+                temp = (byte) (temp << i);
+                temp = (byte) (succeed & temp);
+                if (temp != 0) {
+ bw2.write(key.toString());
+ bw2.newLine();
+ byte succeedCode = GeneCode.getGeneCodeFromBitMap(temp);
+ shiftedCode = key.shiftKmerWithNextCode(succeedCode);
+ bw2.write(key.toString());
+ bw2.newLine();
+ key.shiftKmerWithPreCode(shiftedCode);
+ }
+ }
+ bw.write(key.toString() + "\t" + value.toString());
+ bw.newLine();
+ }
+ bw.close();
+
+ dumpResult();
+// TestUtils.compareWithResult(new File(TEST_SOURCE_DIR), new File(EXPECTED_PATH));
+
+ cleanupHadoop();
+
+ }
+
+ private void startHadoop() throws IOException {
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, 2, true, null);
+ dfs = dfsCluster.getFileSystem();
+ mrCluster = new MiniMRCluster(4, dfs.getUri().toString(), 2);
+
+ Path src = new Path(DATA_PATH);
+ Path dest = new Path(HDFS_PATH + "/");
+ dfs.mkdirs(dest);
+ dfs.copyFromLocalFile(src, dest);
+
+ DataOutputStream confOutput = new DataOutputStream(new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
+
+ private void cleanupHadoop() throws IOException {
+ mrCluster.shutdown();
+ dfsCluster.shutdown();
+ }
+
+ private void dumpResult() throws IOException {
+ Path src = new Path(RESULT_PATH);
+ Path dest = new Path(ACTUAL_RESULT_DIR);
+ dfs.copyToLocalFile(src, dest);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphcountfilter/CountFilterTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphcountfilter/CountFilterTest.java
new file mode 100644
index 0000000..c09937c
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/graphcountfilter/CountFilterTest.java
@@ -0,0 +1,101 @@
+package edu.uci.ics.graphcountfilter;
+
+import java.io.BufferedWriter;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MiniMRCluster;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.junit.Test;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.utils.TestUtils;
+
+
+@SuppressWarnings("deprecation")
+public class CountFilterTest {
+ private static final String ACTUAL_RESULT_DIR = "actual2";
+ private static final String COMPARE_DIR = "compare";
+ private JobConf conf = new JobConf();
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
+ private static final String DATA_PATH = "actual1" + "/result1" + "/part-00000";
+ private static final String HDFS_PATH = "/webmap";
+ private static final String RESULT_PATH = "/result2";
+ private static final String EXPECTED_PATH = "expected/result2";
+ private static final String TEST_SOURCE_DIR = COMPARE_DIR + RESULT_PATH + "/comparesource.txt";
+ private static final int COUNT_REDUCER = 4;
+ private static final int SIZE_KMER = 3;
+ private MiniDFSCluster dfsCluster;
+ private MiniMRCluster mrCluster;
+ private FileSystem dfs;
+
+ @SuppressWarnings("resource")
+ @Test
+ public void test() throws Exception {
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ startHadoop();
+
+ // run graph transformation tests
+ CountFilterDriver tldriver = new CountFilterDriver();
+ tldriver.run(HDFS_PATH, RESULT_PATH, COUNT_REDUCER, 1, HADOOP_CONF_PATH);
+
+ SequenceFile.Reader reader = null;
+ Path path = new Path(RESULT_PATH + "/part-00000");
+ reader = new SequenceFile.Reader(dfs, path, conf);
+ KmerBytesWritable key = (KmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ ByteWritable value = (ByteWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+ File filePathTo = new File(TEST_SOURCE_DIR);
+ BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+ while (reader.next(key, value)) {
+ bw.write(key.toString() + "\t" + value.toString());
+ bw.newLine();
+ }
+ bw.close();
+
+ dumpResult();
+ TestUtils.compareWithResult(new File(TEST_SOURCE_DIR), new File(EXPECTED_PATH));
+
+ cleanupHadoop();
+
+    }
+
+ private void startHadoop() throws IOException {
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, 2, true, null);
+ dfs = dfsCluster.getFileSystem();
+ mrCluster = new MiniMRCluster(4, dfs.getUri().toString(), 2);
+
+ Path src = new Path(DATA_PATH);
+ Path dest = new Path(HDFS_PATH + "/");
+ dfs.mkdirs(dest);
+ dfs.copyFromLocalFile(src, dest);
+
+ DataOutputStream confOutput = new DataOutputStream(new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
+
+ private void cleanupHadoop() throws IOException {
+ mrCluster.shutdown();
+ dfsCluster.shutdown();
+ }
+
+ private void dumpResult() throws IOException {
+ Path src = new Path(RESULT_PATH);
+ Path dest = new Path(ACTUAL_RESULT_DIR + "/");
+ dfs.copyToLocalFile(src, dest);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmerging/MergePathTest.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmerging/MergePathTest.java
new file mode 100644
index 0000000..5e6e51e
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmerging/MergePathTest.java
@@ -0,0 +1,106 @@
+package edu.uci.ics.pathmerging;
+
+import java.io.BufferedWriter;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MiniMRCluster;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.utils.TestUtils;
+
+@SuppressWarnings("deprecation")
+public class MergePathTest {
+ private static final String ACTUAL_RESULT_DIR = "actual3";
+ private static final String COMPARE_DIR = "compare";
+ private JobConf conf = new JobConf();
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
+ private static final String DATA_PATH = "actual2" + "/result2" + "/part-00000";
+ private static final String HDFS_PATH = "/webmap";
+ private static final String HDFA_PATH_DATA = "/webmapdata";
+
+ private static final String RESULT_PATH = "/result3";
+ private static final String EXPECTED_PATH = "expected/result3";
+ private static final String TEST_SOURCE_DIR = COMPARE_DIR + RESULT_PATH + "/comparesource.txt";
+ private static final int COUNT_REDUCER = 1;
+ private static final int SIZE_KMER = 3;
+
+ private MiniDFSCluster dfsCluster;
+ private MiniMRCluster mrCluster;
+ private FileSystem dfs;
+
+ @SuppressWarnings("resource")
+ @Test
+ public void test() throws Exception {
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ startHadoop();
+
+ MergePathDriver tldriver = new MergePathDriver();
+ tldriver.run(HDFS_PATH, RESULT_PATH, HDFA_PATH_DATA, COUNT_REDUCER, SIZE_KMER, 1, HADOOP_CONF_PATH);
+
+/* SequenceFile.Reader reader = null;
+ Path path = new Path(RESULT_PATH + "/part-00000");
+// Path path = new Path(RESULT_PATH + "/uncomplete0" + "/uncomplete0-r-00000");
+ reader = new SequenceFile.Reader(dfs, path, conf);
+ VKmerBytesWritable key = (VKmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ MergePathValueWritable value = (MergePathValueWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+ File filePathTo = new File(TEST_SOURCE_DIR);
+ BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+ while (reader.next(key, value)) {
+ bw.write(key.toString() + "\t" + value.getAdjBitMap() + "\t" + value.getFlag());
+ bw.newLine();
+ }
+ bw.close();*/
+ dumpResult();
+
+// TestUtils.compareWithResult(new File(TEST_SOURCE_DIR), new File(EXPECTED_PATH));
+
+ cleanupHadoop();
+
+    }
+
+ private void startHadoop() throws IOException {
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, 2, true, null);
+ dfs = dfsCluster.getFileSystem();
+ mrCluster = new MiniMRCluster(4, dfs.getUri().toString(), 2);
+
+ Path src = new Path(DATA_PATH);
+ Path dest = new Path(HDFS_PATH + "/");
+ dfs.mkdirs(dest);
+ dfs.copyFromLocalFile(src, dest);
+ Path data = new Path(HDFA_PATH_DATA + "/");
+ dfs.mkdirs(data);
+
+ DataOutputStream confOutput = new DataOutputStream(new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
+
+ private void cleanupHadoop() throws IOException {
+ mrCluster.shutdown();
+ dfsCluster.shutdown();
+ }
+
+ private void dumpResult() throws IOException {
+// Path src = new Path(HDFA_PATH_DATA + "/" + "complete2");
+ Path src = new Path(RESULT_PATH);
+ Path dest = new Path(ACTUAL_RESULT_DIR + "/");
+ dfs.copyToLocalFile(src, dest);
+ }
+}
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmergingh2/MergePathH2Test.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmergingh2/MergePathH2Test.java
new file mode 100644
index 0000000..ff15299
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/pathmergingh2/MergePathH2Test.java
@@ -0,0 +1,105 @@
+package edu.uci.ics.pathmergingh2;
+
+import java.io.BufferedWriter;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MiniMRCluster;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.junit.Test;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.utils.TestUtils;
+
+@SuppressWarnings("deprecation")
+public class MergePathH2Test {
+ private static final String ACTUAL_RESULT_DIR = "actual4";
+ private static final String COMPARE_DIR = "compare";
+ private JobConf conf = new JobConf();
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
+ private static final String DATA_PATH = "actual2" + "/result2" + "/part-00000";
+ private static final String HDFS_PATH = "/webmap";
+ private static final String HDFA_PATH_DATA = "/webmapdata";
+
+ private static final String RESULT_PATH = "/result4";
+ private static final String EXPECTED_PATH = "expected/result4";
+ private static final String TEST_SOURCE_DIR = COMPARE_DIR + RESULT_PATH + "/comparesource.txt";
+ private static final int COUNT_REDUCER = 1;
+ private static final int SIZE_KMER = 3;
+
+ private MiniDFSCluster dfsCluster;
+ private MiniMRCluster mrCluster;
+ private FileSystem dfs;
+
+ @SuppressWarnings("resource")
+ @Test
+ public void test() throws Exception {
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ startHadoop();
+
+ MergePathH2Driver tldriver = new MergePathH2Driver();
+ tldriver.run(HDFS_PATH, RESULT_PATH, HDFA_PATH_DATA, COUNT_REDUCER, SIZE_KMER, 1, HADOOP_CONF_PATH);
+
+/* SequenceFile.Reader reader = null;
+// Path path = new Path(RESULT_PATH + "/part-00000");
+ Path path = new Path(RESULT_PATH + "/uncomplete0" + "/uncomplete0-r-00000");
+ reader = new SequenceFile.Reader(dfs, path, conf);
+ VKmerBytesWritable key = (VKmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ MergePathValueWritable value = (MergePathValueWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+ File filePathTo = new File(TEST_SOURCE_DIR);
+ BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+ while (reader.next(key, value)) {
+ bw.write(key.toString() + "\t" + value.getAdjBitMap() + "\t" + value.getFlag());
+ bw.newLine();
+ }
+ bw.close();*/
+// dumpResult();
+
+// TestUtils.compareWithResult(new File(TEST_SOURCE_DIR), new File(EXPECTED_PATH));
+
+ cleanupHadoop();
+
+    }
+
+ private void startHadoop() throws IOException {
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, 2, true, null);
+ dfs = dfsCluster.getFileSystem();
+ mrCluster = new MiniMRCluster(4, dfs.getUri().toString(), 2);
+
+ Path src = new Path(DATA_PATH);
+ Path dest = new Path(HDFS_PATH + "/");
+ dfs.mkdirs(dest);
+ dfs.copyFromLocalFile(src, dest);
+ Path data = new Path(HDFA_PATH_DATA + "/");
+ dfs.mkdirs(data);
+
+ DataOutputStream confOutput = new DataOutputStream(new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
+
+ private void cleanupHadoop() throws IOException {
+ mrCluster.shutdown();
+ dfsCluster.shutdown();
+ }
+
+ private void dumpResult() throws IOException {
+// Path src = new Path(HDFA_PATH_DATA + "/" + "complete2");
+ Path src = new Path(RESULT_PATH);
+ Path dest = new Path(ACTUAL_RESULT_DIR + "/");
+ dfs.copyToLocalFile(src, dest);
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-hadoop/src/test/java/edu/uci/ics/utils/TestUtils.java b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/utils/TestUtils.java
new file mode 100755
index 0000000..015017a
--- /dev/null
+++ b/genomix/genomix-hadoop/src/test/java/edu/uci/ics/utils/TestUtils.java
@@ -0,0 +1,75 @@
+package edu.uci.ics.utils;
+
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+
+/**
+ * This class offers comparison services for GraphBuildingTest and the other tests.
+ */
+public class TestUtils {
+ public static void compareWithResult(File expectedFile, File actualFile) throws Exception {
+ BufferedReader readerExpected = new BufferedReader(new FileReader(expectedFile));
+ BufferedReader readerActual = new BufferedReader(new FileReader(actualFile));
+ String lineExpected, lineActual;
+ int num = 1;
+ try {
+ while ((lineExpected = readerExpected.readLine()) != null) {
+ lineActual = readerActual.readLine();
+ // Assert.assertEquals(lineExpected, lineActual);
+ if (lineActual == null) {
+ throw new Exception("Actual result changed at line " + num + ":\n< " + lineExpected + "\n> ");
+ }
+ if (!equalStrings(lineExpected, lineActual)) {
+ throw new Exception("Result for changed at line " + num + ":\n< " + lineExpected + "\n> "
+ + lineActual);
+ }
+ ++num;
+ }
+ lineActual = readerActual.readLine();
+ if (lineActual != null) {
+ throw new Exception("Actual result changed at line " + num + ":\n< \n> " + lineActual);
+ }
+ } finally {
+ readerExpected.close();
+ readerActual.close();
+ }
+ }
+
+ private static boolean equalStrings(String s1, String s2) {
+ String[] rowsOne = s1.split("\t");
+ String[] rowsTwo = s2.split("\t");
+
+ if (rowsOne.length != rowsTwo.length)
+ return false;
+
+        for (int i = 0; i < rowsOne.length; i++) {
+            if (!rowsOne[i].equals(rowsTwo[i]))
+                return false;
+        }
+ return true;
+ }
+}
diff --git a/genomix/genomix-hyracks/HyracksCodeFormatProfile.xml b/genomix/genomix-hyracks/HyracksCodeFormatProfile.xml
new file mode 100644
index 0000000..733ca5c
--- /dev/null
+++ b/genomix/genomix-hyracks/HyracksCodeFormatProfile.xml
@@ -0,0 +1,784 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<profiles version="11">
+ <profile kind="CodeFormatterProfile" name="HyracksCodeFormatProfile"
+ version="11">
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.disabling_tag"
+ value="@formatter:off" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.blank_lines_before_field"
+ value="0" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.use_on_off_tags"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line"
+ value="false" />
+ <setting id="org.eclipse.jdt.core.formatter.insert_space_after_ellipsis"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_multiple_fields"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_conditional_expression"
+ value="80" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_binary_operator"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_array_initializer"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.blank_lines_after_package"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.continuation_indentation"
+ value="2" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_binary_operator"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.blank_lines_before_package"
+ value="0" />
+ <setting id="org.eclipse.jdt.core.compiler.source" value="1.5" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.comment.format_line_comments"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.join_wrapped_lines"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.blank_lines_before_member_type"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.align_type_members_on_columns"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_unary_operator"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.indent_parameter_description"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment"
+ value="false" />
+ <setting id="org.eclipse.jdt.core.formatter.lineSplit" value="120" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration"
+ value="0" />
+ <setting id="org.eclipse.jdt.core.formatter.indentation.size"
+ value="4" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.enabling_tag"
+ value="@formatter:on" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration"
+ value="16" />
+ <setting id="org.eclipse.jdt.core.formatter.alignment_for_assignment"
+ value="0" />
+ <setting id="org.eclipse.jdt.core.compiler.problem.assertIdentifier"
+ value="error" />
+ <setting id="org.eclipse.jdt.core.formatter.tabulation.char"
+ value="space" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_body"
+ value="true" />
+ <setting id="org.eclipse.jdt.core.formatter.blank_lines_before_method"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_method_declaration"
+ value="0" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.brace_position_for_switch"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.compiler.problem.enumIdentifier"
+ value="error" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch"
+ value="true" />
+ <setting id="org.eclipse.jdt.core.formatter.insert_space_before_ellipsis"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.brace_position_for_block"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_method_declaration"
+ value="end_of_line" />
+ <setting id="org.eclipse.jdt.core.formatter.compact_else_if"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_enum_constant"
+ value="end_of_line" />
+ <setting id="org.eclipse.jdt.core.formatter.comment.indent_root_tags"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.tabulation.size"
+ value="4" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.indent_empty_lines"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_block_in_case"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression"
+ value="16" />
+ <setting id="org.eclipse.jdt.core.compiler.compliance" value="1.5" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer"
+ value="2" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_unary_operator"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_binary_expression"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode"
+ value="enabled" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line"
+ value="false" />
+ <setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_label"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant"
+ value="48" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.format_javadoc_comments"
+ value="true" />
+ <setting id="org.eclipse.jdt.core.formatter.comment.line_length"
+ value="9999" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.blank_lines_between_import_groups"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body"
+ value="0" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.wrap_before_binary_operator"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_block"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration"
+ value="16" />
+ <setting id="org.eclipse.jdt.core.formatter.join_lines_in_comments"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.alignment_for_compact_if"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.blank_lines_before_imports"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.comment.format_html"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.comment.format_source_code"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration"
+ value="16" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.compiler.codegen.targetPlatform"
+ value="1.5" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation"
+ value="0" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_member"
+ value="insert" />
+ <setting id="org.eclipse.jdt.core.formatter.comment.format_header"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.format_block_comments"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant"
+ value="do not insert" />
+ <setting id="org.eclipse.jdt.core.formatter.alignment_for_enum_constants"
+ value="49" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.brace_position_for_type_declaration"
+ value="end_of_line" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries"
+ value="true" />
+ <setting id="org.eclipse.jdt.core.formatter.blank_lines_after_imports"
+ value="1" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header"
+ value="true" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for"
+ value="insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments"
+ value="do not insert" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column"
+ value="false" />
+ <setting
+ id="org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line"
+ value="false" />
+ </profile>
+</profiles>
diff --git a/genomix/genomix-hyracks/pom.xml b/genomix/genomix-hyracks/pom.xml
new file mode 100644
index 0000000..ca5cb61
--- /dev/null
+++ b/genomix/genomix-hyracks/pom.xml
@@ -0,0 +1,265 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <artifactId>genomix-hyracks</artifactId>
+ <name>genomix-hyracks</name>
+
+ <parent>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>genomix</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ </parent>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.0.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ <fork>true</fork>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>appassembler-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <configuration>
+ <programs>
+ <program>
+ <mainClass>edu.uci.ics.genomix.driver.Driver</mainClass>
+ <name>genomix</name>
+ </program>
+ <program>
+ <mainClass>edu.uci.ics.hyracks.control.cc.CCDriver</mainClass>
+ <name>genomixcc</name>
+ </program>
+ <program>
+ <mainClass>edu.uci.ics.hyracks.control.nc.NCDriver</mainClass>
+ <name>genomixnc</name>
+ </program>
+ </programs>
+ <repositoryLayout>flat</repositoryLayout>
+ <repositoryName>lib</repositoryName>
+ </configuration>
+ <phase>package</phase>
+ <goals>
+ <goal>assemble</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>2.2-beta-5</version>
+ <configuration>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ </configuration>
+ <executions>
+ <execution>
+ <configuration>
+ <descriptors>
+ <descriptor>src/main/assembly/binary-assembly.xml</descriptor>
+ </descriptors>
+ </configuration>
+ <phase>package</phase>
+ <goals>
+ <goal>attached</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>make-my-jar-with-dependencies</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.7.2</version>
+ <configuration>
+ <forkMode>pertest</forkMode>
+ <argLine>-enableassertions -Xmx512m -XX:MaxPermSize=300m
+ -Dfile.encoding=UTF-8
+ -Djava.util.logging.config.file=src/test/resources/logging.properties</argLine>
+ <includes>
+ <include>**/*TestSuite.java</include>
+ <include>**/*Test.java</include>
+ </includes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-clean-plugin</artifactId>
+ <configuration>
+ <filesets>
+ <fileset>
+ <directory>.</directory>
+ <includes>
+ <include>teststore*</include>
+ <include>edu*</include>
+ <include>actual*</include>
+ <include>build*</include>
+ <include>expect*</include>
+ <include>ClusterController*</include>
+ </includes>
+ </fileset>
+ </filesets>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>2.5</version>
+ <executions>
+ <execution>
+ <id>copy-scripts</id>
+						<!-- run during the package phase -->
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>target/appassembler/bin</outputDirectory>
+ <resources>
+ <resource>
+ <directory>src/main/resources/scripts</directory>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ <execution>
+ <id>copy-conf</id>
+						<!-- run during the package phase -->
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>target/appassembler/conf</outputDirectory>
+ <resources>
+ <resource>
+ <directory>src/main/resources/conf</directory>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-dataflow-std</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-api</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-dataflow-common</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-data-std</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-control-cc</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-control-nc</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.kenai.nbpwr</groupId>
+ <artifactId>org-apache-commons-io</artifactId>
+ <version>1.3.1-201002241208</version>
+ <type>nbm</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks.examples</groupId>
+ <artifactId>hyracks-integration-tests</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-ipc</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-core</artifactId>
+ <version>0.20.2</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-test</artifactId>
+ <version>0.20.2</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-hdfs-core</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-hdfs-core</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>genomix-data</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/genomix/genomix-hyracks/src/main/assembly/binary-assembly.xml b/genomix/genomix-hyracks/src/main/assembly/binary-assembly.xml
new file mode 100644
index 0000000..68d424a
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/assembly/binary-assembly.xml
@@ -0,0 +1,19 @@
+<assembly>
+ <id>binary-assembly</id>
+ <formats>
+ <format>zip</format>
+ <format>dir</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <fileSets>
+ <fileSet>
+ <directory>target/appassembler/bin</directory>
+ <outputDirectory>bin</outputDirectory>
+ <fileMode>0755</fileMode>
+ </fileSet>
+ <fileSet>
+ <directory>target/appassembler/lib</directory>
+ <outputDirectory>lib</outputDirectory>
+ </fileSet>
+ </fileSets>
+</assembly>
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/ByteSerializerDeserializer.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/ByteSerializerDeserializer.java
new file mode 100644
index 0000000..ebf1282
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/ByteSerializerDeserializer.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.data.std.accessors;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+
+public class ByteSerializerDeserializer implements ISerializerDeserializer<Byte> {
+
+ private static final long serialVersionUID = 1L;
+
+ public static final ByteSerializerDeserializer INSTANCE = new ByteSerializerDeserializer();
+
+ private ByteSerializerDeserializer() {
+ }
+
+ @Override
+ public Byte deserialize(DataInput in) throws HyracksDataException {
+ try {
+ return in.readByte();
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
+
+ @Override
+ public void serialize(Byte instance, DataOutput out) throws HyracksDataException {
+ try {
+ out.writeByte(instance.intValue());
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
+
+ public static byte getByte(byte[] bytes, int offset) {
+ return bytes[offset];
+ }
+
+ public static void putByte(byte val, byte[] bytes, int offset) {
+ bytes[offset] = val;
+ }
+
+}
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerBinaryHashFunctionFamily.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerBinaryHashFunctionFamily.java
new file mode 100644
index 0000000..34c29c7
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerBinaryHashFunctionFamily.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.data.std.accessors;
+
+import edu.uci.ics.genomix.data.std.primitive.KmerPointable;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFamily;
+
+public class KmerBinaryHashFunctionFamily implements IBinaryHashFunctionFamily {
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public IBinaryHashFunction createBinaryHashFunction(final int seed) {
+
+ return new IBinaryHashFunction() {
+ private KmerPointable p = new KmerPointable();
+
+ @Override
+ public int hash(byte[] bytes, int offset, int length) {
+                if (offset + length > bytes.length)
+                    throw new IllegalStateException("kmer range out of bounds");
+ p.set(bytes, offset, length);
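+                // scale by (seed + 1) so each member of the hash family produces a distinct value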
+ int hash = p.hash() * (seed + 1);
+ if (hash < 0) {
+ hash = -(hash + 1);
+ }
+ return hash;
+ }
+ };
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerHashPartitioncomputerFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerHashPartitioncomputerFactory.java
new file mode 100644
index 0000000..8aaf380
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerHashPartitioncomputerFactory.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.data.std.accessors;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputerFactory;
+
+public class KmerHashPartitioncomputerFactory implements ITuplePartitionComputerFactory {
+
+ private static final long serialVersionUID = 1L;
+
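+    // polynomial 31-based hash over the kmer bytes (same shape as String.hashCode())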
+ public static int hashBytes(byte[] bytes, int offset, int length) {
+ int hash = 1;
+ for (int i = offset; i < offset + length; i++)
+ hash = (31 * hash) + (int) bytes[i];
+ return hash;
+ }
+
+ @Override
+ public ITuplePartitionComputer createPartitioner() {
+ return new ITuplePartitionComputer() {
+ @Override
+ public int partition(IFrameTupleAccessor accessor, int tIndex, int nParts) {
+ int startOffset = accessor.getTupleStartOffset(tIndex);
+ int fieldOffset = accessor.getFieldStartOffset(tIndex, 0);
+ int slotLength = accessor.getFieldSlotsLength();
+ int fieldLength = accessor.getFieldLength(tIndex, 0);
+
+ ByteBuffer buf = accessor.getBuffer();
+
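+                // hash the kmer field; -(hash + 1) folds negatives into the non-negative
+                // range without the Integer.MIN_VALUE overflow that Math.abs would hit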
+ int hash = hashBytes(buf.array(), startOffset + fieldOffset + slotLength, fieldLength);
+ if (hash < 0) {
+ hash = -(hash + 1);
+ }
+
+ return hash % nParts;
+ }
+ };
+ }
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerNormarlizedComputerFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerNormarlizedComputerFactory.java
new file mode 100644
index 0000000..83b9d12
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/accessors/KmerNormarlizedComputerFactory.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.data.std.accessors;
+
+import edu.uci.ics.genomix.data.std.primitive.KmerPointable;
+import edu.uci.ics.hyracks.api.dataflow.value.INormalizedKeyComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.INormalizedKeyComputerFactory;
+
+public class KmerNormarlizedComputerFactory implements INormalizedKeyComputerFactory {
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public INormalizedKeyComputer createNormalizedKeyComputer() {
+ return new INormalizedKeyComputer() {
+ /**
+             * Reads one int from the kmer; its ordering must stay consistent with the kmer comparator.
+ */
+ @Override
+ public int normalize(byte[] bytes, int start, int length) {
+ return KmerPointable.getIntReverse(bytes, start, length);
+ }
+ };
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/primitive/KmerPointable.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/primitive/KmerPointable.java
new file mode 100644
index 0000000..8febfa5
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/data/std/primitive/KmerPointable.java
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.data.std.primitive;
+
+import edu.uci.ics.genomix.data.std.accessors.KmerHashPartitioncomputerFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTraits;
+import edu.uci.ics.hyracks.data.std.api.AbstractPointable;
+import edu.uci.ics.hyracks.data.std.api.IComparable;
+import edu.uci.ics.hyracks.data.std.api.IHashable;
+import edu.uci.ics.hyracks.data.std.api.INumeric;
+import edu.uci.ics.hyracks.data.std.api.IPointable;
+import edu.uci.ics.hyracks.data.std.api.IPointableFactory;
+
+public final class KmerPointable extends AbstractPointable implements IHashable, IComparable, INumeric {
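+    // a kmer's byte length depends on k, so the binary type is variable-length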
+ public static final ITypeTraits TYPE_TRAITS = new ITypeTraits() {
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public boolean isFixedLength() {
+ return false;
+ }
+
+ @Override
+ public int getFixedLength() {
+ return -1;
+ }
+ };
+
+ public static final IPointableFactory FACTORY = new IPointableFactory() {
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public IPointable createPointable() {
+ return new KmerPointable();
+ }
+
+ @Override
+ public ITypeTraits getTypeTraits() {
+ return TYPE_TRAITS;
+ }
+ };
+
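+    // the "reverse" readers treat the kmer's trailing bytes as most significant,
+    // matching compareTo(), which compares byte-wise from the tail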
+ public static short getShortReverse(byte[] bytes, int offset, int length) {
+ if (length < 2) {
+ return (short) (bytes[offset] & 0xff);
+ }
+ return (short) (((bytes[offset + length - 1] & 0xff) << 8) + (bytes[offset + length - 2] & 0xff));
+ }
+
+ public static int getIntReverse(byte[] bytes, int offset, int length) {
+ int shortValue = getShortReverse(bytes, offset, length) & 0xffff;
+
+ if (length < 3) {
+ return shortValue;
+ }
+ if (length == 3) {
+ return (((bytes[offset + 2] & 0xff) << 16) + ((bytes[offset + 1] & 0xff) << 8) + ((bytes[offset] & 0xff)));
+ }
+ return ((bytes[offset + length - 1] & 0xff) << 24) + ((bytes[offset + length - 2] & 0xff) << 16)
+ + ((bytes[offset + length - 3] & 0xff) << 8) + ((bytes[offset + length - 4] & 0xff) << 0);
+ }
+
+ public static long getLongReverse(byte[] bytes, int offset, int length) {
+ if (length < 8) {
+ return ((long) getIntReverse(bytes, offset, length)) & 0x0ffffffffL;
+ }
+ return (((long) (bytes[offset + length - 1] & 0xff)) << 56)
+ + (((long) (bytes[offset + length - 2] & 0xff)) << 48)
+ + (((long) (bytes[offset + length - 3] & 0xff)) << 40)
+ + (((long) (bytes[offset + length - 4] & 0xff)) << 32)
+ + (((long) (bytes[offset + length - 5] & 0xff)) << 24)
+ + (((long) (bytes[offset + length - 6] & 0xff)) << 16)
+ + (((long) (bytes[offset + length - 7] & 0xff)) << 8) + (((long) (bytes[offset + length - 8] & 0xff)));
+ }
+
+ @Override
+ public int compareTo(IPointable pointer) {
+ return compareTo(pointer.getByteArray(), pointer.getStartOffset(), pointer.getLength());
+ }
+
+ @Override
+ public int compareTo(byte[] bytes, int offset, int length) {
+
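+        // kmers of different byte lengths never compare equal; otherwise compare
+        // byte-wise from the tail, the most significant end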
+ if (this.length != length) {
+ return this.length - length;
+ }
+ for (int i = length - 1; i >= 0; i--) {
+ int cmp = (this.bytes[this.start + i] & 0xff) - (bytes[offset + i] & 0xff);
+ if (cmp != 0) {
+ return cmp;
+ }
+ }
+
+ return 0;
+ }
+
+ @Override
+ public int hash() {
+ int hash = KmerHashPartitioncomputerFactory.hashBytes(bytes, start, length);
+ return hash;
+ }
+
+ @Override
+ public byte byteValue() {
+ return bytes[start + length - 1];
+ }
+
+ @Override
+ public short shortValue() {
+ return getShortReverse(bytes, start, length);
+ }
+
+ @Override
+ public int intValue() {
+ return getIntReverse(bytes, start, length);
+ }
+
+ @Override
+ public long longValue() {
+ return getLongReverse(bytes, start, length);
+ }
+
+ @Override
+ public float floatValue() {
+ return Float.intBitsToFloat(intValue());
+ }
+
+ @Override
+ public double doubleValue() {
+ return Double.longBitsToDouble(longValue());
+ }
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ConnectorPolicyAssignmentPolicy.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ConnectorPolicyAssignmentPolicy.java
new file mode 100644
index 0000000..ed1b926
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ConnectorPolicyAssignmentPolicy.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.dataflow;
+
+import edu.uci.ics.hyracks.api.dataflow.IConnectorDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.connectors.IConnectorPolicy;
+import edu.uci.ics.hyracks.api.dataflow.connectors.IConnectorPolicyAssignmentPolicy;
+import edu.uci.ics.hyracks.api.dataflow.connectors.PipeliningConnectorPolicy;
+import edu.uci.ics.hyracks.api.dataflow.connectors.SendSideMaterializedPipeliningConnectorPolicy;
+import edu.uci.ics.hyracks.dataflow.std.connectors.MToNPartitioningMergingConnectorDescriptor;
+
+/**
+ * Used by the pre-clustered group-by.
+ */
+public class ConnectorPolicyAssignmentPolicy implements IConnectorPolicyAssignmentPolicy {
+ private static final long serialVersionUID = 1L;
+ private IConnectorPolicy senderSideMaterializePolicy = new SendSideMaterializedPipeliningConnectorPolicy();
+ private IConnectorPolicy pipeliningPolicy = new PipeliningConnectorPolicy();
+
+ @Override
+ public IConnectorPolicy getConnectorPolicyAssignment(IConnectorDescriptor c, int nProducers, int nConsumers,
+ int[] fanouts) {
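+        // materialize the send side of merging connectors to avoid pipeline deadlock;
+        // everything else can pipeline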
+ if (c instanceof MToNPartitioningMergingConnectorDescriptor) {
+ return senderSideMaterializePolicy;
+ } else {
+ return pipeliningPolicy;
+ }
+ }
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
new file mode 100644
index 0000000..405e109
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerSequenceWriterFactory.java
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.dataflow;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Writer;
+import org.apache.hadoop.mapred.JobConf;
+
+import edu.uci.ics.genomix.job.GenomixJob;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.hdfs.api.ITupleWriter;
+import edu.uci.ics.hyracks.hdfs.api.ITupleWriterFactory;
+import edu.uci.ics.hyracks.hdfs.dataflow.ConfFactory;
+
+@SuppressWarnings("deprecation")
+public class KMerSequenceWriterFactory implements ITupleWriterFactory {
+
+ private static final long serialVersionUID = 1L;
+ private ConfFactory confFactory;
+ private final int kmerlength;
+
+ public KMerSequenceWriterFactory(JobConf conf) throws HyracksDataException {
+ this.confFactory = new ConfFactory(conf);
+ this.kmerlength = conf.getInt(GenomixJob.KMER_LENGTH, GenomixJob.DEFAULT_KMER);
+ }
+
+ public class TupleWriter implements ITupleWriter {
+ public TupleWriter(ConfFactory cf) {
+ this.cf = cf;
+ }
+
+ ConfFactory cf;
+ Writer writer = null;
+
+ KmerCountValue reEnterCount = new KmerCountValue();
+ KmerBytesWritable reEnterKey = new KmerBytesWritable(kmerlength);
+
+ /**
+         * Assumption: the output never changes the source!
+ */
+ @Override
+ public void write(DataOutput output, ITupleReference tuple) throws HyracksDataException {
+ try {
+ byte[] kmer = tuple.getFieldData(0);
+ int keyStart = tuple.getFieldStart(0);
+ int keyLength = tuple.getFieldLength(0);
+
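+                // field 1 carries the one-byte code bitmap, field 2 the one-byte count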
+ byte bitmap = tuple.getFieldData(1)[tuple.getFieldStart(1)];
+ byte count = tuple.getFieldData(2)[tuple.getFieldStart(2)];
+ reEnterCount.set(bitmap, count);
+ reEnterKey.set(kmer, keyStart, keyLength);
+ writer.append(reEnterKey, reEnterCount);
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
+
+ @Override
+ public void open(DataOutput output) throws HyracksDataException {
+ try {
+ writer = SequenceFile.createWriter(cf.getConf(), (FSDataOutputStream) output, KmerBytesWritable.class,
+ KmerCountValue.class, CompressionType.NONE, null);
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
+
+ @Override
+ public void close(DataOutput output) throws HyracksDataException {
+            // intentionally a no-op: the underlying output stream is owned and closed by the caller
+ }
+ }
+
+ @Override
+ public ITupleWriter getTupleWriter(IHyracksTaskContext ctx) throws HyracksDataException {
+ return new TupleWriter(confFactory);
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
new file mode 100644
index 0000000..cfa7262
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/KMerTextWriterFactory.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.dataflow;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.hdfs.api.ITupleWriter;
+import edu.uci.ics.hyracks.hdfs.api.ITupleWriterFactory;
+
+public class KMerTextWriterFactory implements ITupleWriterFactory {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ private KmerBytesWritable kmer;
+
+ public KMerTextWriterFactory(int k) {
+ kmer = new KmerBytesWritable(k);
+ }
+
+ public class TupleWriter implements ITupleWriter {
+ @Override
+ public void write(DataOutput output, ITupleReference tuple) throws HyracksDataException {
+ try {
+ kmer.set(tuple.getFieldData(0), tuple.getFieldStart(0), tuple.getFieldLength(0));
+ output.write(kmer.toString().getBytes());
+ output.writeByte('\t');
+ output.write(GeneCode.getSymbolFromBitMap(tuple.getFieldData(1)[tuple.getFieldStart(1)]).getBytes());
+ output.writeByte('\t');
+ output.write(String.valueOf((int) tuple.getFieldData(2)[tuple.getFieldStart(2)]).getBytes());
+ output.writeByte('\n');
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
+
+ @Override
+ public void open(DataOutput output) throws HyracksDataException {
+            // no-op: the text writer needs no per-file setup
+
+ }
+
+ @Override
+ public void close(DataOutput output) throws HyracksDataException {
+            // no-op: nothing buffered to flush or release
+ }
+ }
+
+ @Override
+ public ITupleWriter getTupleWriter(IHyracksTaskContext ctx) throws HyracksDataException {
+ return new TupleWriter();
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
new file mode 100644
index 0000000..409b434
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/ReadsKeyValueParserFactory.java
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.dataflow;
+
+import java.nio.ByteBuffer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+import edu.uci.ics.genomix.data.std.accessors.ByteSerializerDeserializer;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.hyracks.api.comm.IFrameWriter;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
+import edu.uci.ics.hyracks.hdfs.api.IKeyValueParser;
+import edu.uci.ics.hyracks.hdfs.api.IKeyValueParserFactory;
+
+public class ReadsKeyValueParserFactory implements IKeyValueParserFactory<LongWritable, Text> {
+ private static final long serialVersionUID = 1L;
+
+ private KmerBytesWritable kmer;
+ private boolean bReversed;
+
+ public ReadsKeyValueParserFactory(int k, boolean bGenerateReversed) {
+ bReversed = bGenerateReversed;
+ kmer = new KmerBytesWritable(k);
+ }
+
+ @Override
+ public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
+ final ArrayTupleBuilder tupleBuilder = new ArrayTupleBuilder(2);
+ final ByteBuffer outputBuffer = ctx.allocateFrame();
+ final FrameTupleAppender outputAppender = new FrameTupleAppender(ctx.getFrameSize());
+ outputAppender.reset(outputBuffer, true);
+
+ return new IKeyValueParser<LongWritable, Text>() {
+
+ @Override
+ public void parse(LongWritable key, Text value, IFrameWriter writer) throws HyracksDataException {
+                String geneLine = value.toString(); // the raw gene line of this read
+ Pattern genePattern = Pattern.compile("[AGCT]+");
+ Matcher geneMatcher = genePattern.matcher(geneLine);
+ boolean isValid = geneMatcher.matches();
+ if (isValid) {
+ SplitReads(geneLine.getBytes(), writer);
+ }
+ }
+
+ private void SplitReads(byte[] array, IFrameWriter writer) {
+ /** first kmer */
+ int k = kmer.getKmerLength();
+ if (k >= array.length){
+ return;
+ }
+ kmer.setByRead(array, 0);
+ byte pre = 0;
+ byte next = GeneCode.getAdjBit(array[k]);
+ InsertToFrame(kmer, pre, next, writer);
+
+ /** middle kmer */
+ for (int i = k; i < array.length - 1; i++) {
+ pre = GeneCode.getBitMapFromGeneCode(kmer.shiftKmerWithNextChar(array[i]));
+ next = GeneCode.getAdjBit(array[i + 1]);
+ InsertToFrame(kmer, pre, next, writer);
+ }
+
+ /** last kmer */
+ pre = GeneCode.getBitMapFromGeneCode(kmer.shiftKmerWithNextChar(array[array.length - 1]));
+ next = 0;
+ InsertToFrame(kmer, pre, next, writer);
+
+ if (bReversed) {
+ /** first kmer */
+ kmer.setByReadReverse(array, 0);
+ next = 0;
+ pre = GeneCode.getAdjBit(array[k]);
+ InsertToFrame(kmer, pre, next, writer);
+ /** middle kmer */
+ for (int i = k; i < array.length - 1; i++) {
+ next = GeneCode.getBitMapFromGeneCode(kmer.shiftKmerWithPreChar(array[i]));
+ pre = GeneCode.getAdjBit(array[i + 1]);
+ InsertToFrame(kmer, pre, next, writer);
+ }
+ /** last kmer */
+ next = GeneCode.getBitMapFromGeneCode(kmer.shiftKmerWithPreChar(array[array.length - 1]));
+ pre = 0;
+ InsertToFrame(kmer, pre, next, writer);
+ }
+ }
+
+ /**
+             * At this graph-building phase we assume all kmers have the same
+             * length, so the kmer length itself is not written to the output.
+ *
+ * @param kmer
+ * :input kmer
+ * @param pre
+ * : pre neighbor code
+ * @param next
+ * : next neighbor code
+ * @param writer
+ * : output writer
+ */
+ private void InsertToFrame(KmerBytesWritable kmer, byte pre, byte next, IFrameWriter writer) {
+ try {
+ byte adj = GeneCode.mergePreNextAdj(pre, next);
+ tupleBuilder.reset();
+ tupleBuilder.addField(kmer.getBytes(), 0, kmer.getLength());
+ tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE, adj);
+
+ if (!outputAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0,
+ tupleBuilder.getSize())) {
+ FrameUtils.flushFrame(outputBuffer, writer);
+ outputAppender.reset(outputBuffer, true);
+ if (!outputAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0,
+ tupleBuilder.getSize())) {
+ throw new IllegalStateException(
+ "Failed to copy an record into a frame: the record size is too large.");
+ }
+ }
+ } catch (Exception e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ @Override
+ public void open(IFrameWriter writer) throws HyracksDataException {
+                // no-op: the output frame is already allocated and reset above
+
+ }
+
+ @Override
+ public void close(IFrameWriter writer) throws HyracksDataException {
+ FrameUtils.flushFrame(outputBuffer, writer);
+ }
+ };
+ }
+
+}
\ No newline at end of file
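
To make the sliding-window logic in SplitReads concrete, the following standalone sketch (plain Java, independent of the Kmer/GeneCode types) prints the windows and pre/next neighbors that the forward pass above emits; the read string and k are invented for illustration:

    // For read "AGCTGA" and k = 3 this prints:
    //   AGC pre=- next=T
    //   GCT pre=A next=G
    //   CTG pre=G next=A
    //   TGA pre=T next=-
    String read = "AGCTGA";
    int k = 3;
    for (int i = 0; i + k <= read.length(); i++) {
        char pre = (i == 0) ? '-' : read.charAt(i - 1);                  // 0 in the real code
        char next = (i + k == read.length()) ? '-' : read.charAt(i + k); // 0 in the real code
        System.out.println(read.substring(i, i + k) + " pre=" + pre + " next=" + next);
    }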
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/DistributedMergeLmerAggregateFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/DistributedMergeLmerAggregateFactory.java
new file mode 100644
index 0000000..ea70fb0
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/DistributedMergeLmerAggregateFactory.java
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.dataflow.aggregators;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.genomix.data.std.accessors.ByteSerializerDeserializer;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+
+/**
+ * sum
+ */
+public class DistributedMergeLmerAggregateFactory implements IAggregatorDescriptorFactory {
+ private static final long serialVersionUID = 1L;
+
+ public DistributedMergeLmerAggregateFactory() {
+ }
+
+ public class DistributeAggregatorDescriptor implements IAggregatorDescriptor {
+ private static final int MAX = 127;
+
+ @Override
+ public void reset() {
+ }
+
+ @Override
+ public void close() {
+            // no-op: no resources are held
+
+ }
+
+ @Override
+ public AggregateState createAggregateStates() {
+ return new AggregateState(new Object() {
+ });
+ }
+
+ protected byte getField(IFrameTupleAccessor accessor, int tIndex, int fieldId) {
+ int tupleOffset = accessor.getTupleStartOffset(tIndex);
+ int fieldStart = accessor.getFieldStartOffset(tIndex, fieldId);
+ int offset = tupleOffset + fieldStart + accessor.getFieldSlotsLength();
+ byte data = ByteSerializerDeserializer.getByte(accessor.getBuffer().array(), offset);
+ return data;
+ }
+
+ @Override
+ public void init(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex, AggregateState state)
+ throws HyracksDataException {
+ byte bitmap = getField(accessor, tIndex, 1);
+ byte count = getField(accessor, tIndex, 2);
+
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ try {
+ fieldOutput.writeByte(bitmap);
+ tupleBuilder.addFieldEndOffset();
+ fieldOutput.writeByte(count);
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when initializing the aggregator.");
+ }
+ }
+
+ @Override
+ public void aggregate(IFrameTupleAccessor accessor, int tIndex, IFrameTupleAccessor stateAccessor,
+ int stateTupleIndex, AggregateState state) throws HyracksDataException {
+ byte bitmap = getField(accessor, tIndex, 1);
+ short count = getField(accessor, tIndex, 2);
+
+ int statetupleOffset = stateAccessor.getTupleStartOffset(stateTupleIndex);
+ int bitfieldStart = stateAccessor.getFieldStartOffset(stateTupleIndex, 1);
+ int countfieldStart = stateAccessor.getFieldStartOffset(stateTupleIndex, 2);
+ int bitoffset = statetupleOffset + stateAccessor.getFieldSlotsLength() + bitfieldStart;
+ int countoffset = statetupleOffset + stateAccessor.getFieldSlotsLength() + countfieldStart;
+
+ byte[] data = stateAccessor.getBuffer().array();
+
+ bitmap |= data[bitoffset];
+ count += data[countoffset];
+ if (count >= MAX) {
+ count = (byte) MAX;
+ }
+ data[bitoffset] = bitmap;
+ data[countoffset] = (byte) count;
+ }
+
+ @Override
+ public void outputPartialResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ byte bitmap = getField(accessor, tIndex, 1);
+ byte count = getField(accessor, tIndex, 2);
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ try {
+ fieldOutput.writeByte(bitmap);
+ tupleBuilder.addFieldEndOffset();
+ fieldOutput.writeByte(count);
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when writing aggregation to the output buffer.");
+ }
+
+ }
+
+ @Override
+ public void outputFinalResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ outputPartialResult(tupleBuilder, accessor, tIndex, state);
+ }
+
+ }
+
+ @Override
+ public IAggregatorDescriptor createAggregator(IHyracksTaskContext ctx, RecordDescriptor inRecordDescriptor,
+ RecordDescriptor outRecordDescriptor, int[] keyFields, int[] keyFieldsInPartialResults)
+ throws HyracksDataException {
+ return new DistributeAggregatorDescriptor();
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/LocalAggregatorDescriptor.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/LocalAggregatorDescriptor.java
new file mode 100644
index 0000000..87c0207
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/LocalAggregatorDescriptor.java
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.dataflow.aggregators;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.genomix.data.std.accessors.ByteSerializerDeserializer;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+
+public class LocalAggregatorDescriptor implements IAggregatorDescriptor {
+ private static final int MAX = 127;
+
+ @Override
+ public void reset() {
+ }
+
+ @Override
+ public void close() {
+        // no-op: no resources are held
+
+ }
+
+ @Override
+ public AggregateState createAggregateStates() {
+ return new AggregateState(new Object() {
+ });
+ }
+
+ protected byte getField(IFrameTupleAccessor accessor, int tIndex, int fieldId) {
+ int tupleOffset = accessor.getTupleStartOffset(tIndex);
+ int fieldStart = accessor.getFieldStartOffset(tIndex, fieldId);
+ int offset = tupleOffset + fieldStart + accessor.getFieldSlotsLength();
+ byte data = ByteSerializerDeserializer.getByte(accessor.getBuffer().array(), offset);
+ return data;
+ }
+
+ @Override
+ public void init(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex, AggregateState state)
+ throws HyracksDataException {
+ byte bitmap = getField(accessor, tIndex, 1);
+ byte count = 1;
+
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ try {
+ fieldOutput.writeByte(bitmap);
+ tupleBuilder.addFieldEndOffset();
+ fieldOutput.writeByte(count);
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when initializing the aggregator.");
+ }
+ }
+
+ @Override
+ public void aggregate(IFrameTupleAccessor accessor, int tIndex, IFrameTupleAccessor stateAccessor,
+ int stateTupleIndex, AggregateState state) throws HyracksDataException {
+ byte bitmap = getField(accessor, tIndex, 1);
+ short count = 1;
+
+ int statetupleOffset = stateAccessor.getTupleStartOffset(stateTupleIndex);
+ int bitfieldStart = stateAccessor.getFieldStartOffset(stateTupleIndex, 1);
+ int countfieldStart = stateAccessor.getFieldStartOffset(stateTupleIndex, 2);
+ int bitoffset = statetupleOffset + stateAccessor.getFieldSlotsLength() + bitfieldStart;
+ int countoffset = statetupleOffset + stateAccessor.getFieldSlotsLength() + countfieldStart;
+
+ byte[] data = stateAccessor.getBuffer().array();
+
+ bitmap |= data[bitoffset];
+ count += data[countoffset];
+ if (count >= MAX) {
+ count = (byte) MAX;
+ }
+ data[bitoffset] = bitmap;
+ data[countoffset] = (byte) count;
+ }
+
+ @Override
+ public void outputPartialResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ byte bitmap = getField(accessor, tIndex, 1);
+ byte count = getField(accessor, tIndex, 2);
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ try {
+ fieldOutput.writeByte(bitmap);
+ tupleBuilder.addFieldEndOffset();
+ fieldOutput.writeByte(count);
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when writing aggregation to the output buffer.");
+ }
+
+ }
+
+ @Override
+ public void outputFinalResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ outputPartialResult(tupleBuilder, accessor, tIndex, state);
+ }
+
+};
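
Both aggregators above implement the same merge rule: OR the adjacency bitmaps together and add the one-byte counts with saturation at 127 (the local variant contributes 1 per kmer occurrence, while the distributed variant sums the partial counts produced upstream). A self-contained sketch of that rule, in plain Java:

    public class KmerMergeRule {
        private static final int MAX = 127; // counts are capped at Byte.MAX_VALUE

        static byte mergeBitmap(byte a, byte b) {
            return (byte) (a | b); // union of the two neighbor sets
        }

        static byte mergeCount(byte a, byte b) {
            int sum = a + b; // widen before adding so the byte sum cannot wrap
            return (byte) Math.min(sum, MAX);
        }

        public static void main(String[] args) {
            System.out.println(mergeCount((byte) 100, (byte) 50)); // prints 127, not -106
        }
    }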
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/MergeKmerAggregateFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/MergeKmerAggregateFactory.java
new file mode 100644
index 0000000..b5eb70f
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/dataflow/aggregators/MergeKmerAggregateFactory.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.dataflow.aggregators;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+
+/**
+ * count
+ */
+public class MergeKmerAggregateFactory implements IAggregatorDescriptorFactory {
+ private static final long serialVersionUID = 1L;
+
+ public MergeKmerAggregateFactory() {
+ }
+
+ @Override
+ public IAggregatorDescriptor createAggregator(IHyracksTaskContext ctx, RecordDescriptor inRecordDescriptor,
+ RecordDescriptor outRecordDescriptor, int[] keyFields, int[] keyFieldsInPartialResults)
+ throws HyracksDataException {
+ return new LocalAggregatorDescriptor();
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/driver/Driver.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/driver/Driver.java
new file mode 100644
index 0000000..27d5e15
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/driver/Driver.java
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.driver;
+
+import java.net.URL;
+import java.util.EnumSet;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.GenericOptionsParser;
+
+import edu.uci.ics.genomix.job.GenomixJob;
+import edu.uci.ics.genomix.job.JobGen;
+import edu.uci.ics.genomix.job.JobGenBrujinGraph;
+import edu.uci.ics.hyracks.api.client.HyracksConnection;
+import edu.uci.ics.hyracks.api.client.IHyracksClientConnection;
+import edu.uci.ics.hyracks.api.client.NodeControllerInfo;
+import edu.uci.ics.hyracks.api.exceptions.HyracksException;
+import edu.uci.ics.hyracks.api.job.JobFlag;
+import edu.uci.ics.hyracks.api.job.JobId;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.hdfs.scheduler.Scheduler;
+
+public class Driver {
+ public static enum Plan {
+ BUILD_DEBRUJIN_GRAPH,
+ GRAPH_CLEANNING,
+ CONTIGS_GENERATION,
+ }
+
+ private static final String IS_PROFILING = "genomix.driver.profiling";
+ private static final String CPARTITION_PER_MACHINE = "genomix.driver.duplicate.num";
+ private static final Log LOG = LogFactory.getLog(Driver.class);
+ private JobGen jobGen;
+ private boolean profiling;
+
+ private int numPartitionPerMachine;
+
+ private IHyracksClientConnection hcc;
+ private Scheduler scheduler;
+
+ public Driver(String ipAddress, int port, int numPartitionPerMachine) throws HyracksException {
+ try {
+ hcc = new HyracksConnection(ipAddress, port);
+ scheduler = new Scheduler(hcc.getNodeControllerInfos());
+ } catch (Exception e) {
+ throw new HyracksException(e);
+ }
+ this.numPartitionPerMachine = numPartitionPerMachine;
+ }
+
+ public void runJob(GenomixJob job) throws HyracksException {
+ runJob(job, Plan.BUILD_DEBRUJIN_GRAPH, false);
+ }
+
+ public void runJob(GenomixJob job, Plan planChoice, boolean profiling) throws HyracksException {
+ /** add hadoop configurations */
+ URL hadoopCore = job.getClass().getClassLoader().getResource("core-site.xml");
+ job.addResource(hadoopCore);
+ URL hadoopMapRed = job.getClass().getClassLoader().getResource("mapred-site.xml");
+ job.addResource(hadoopMapRed);
+ URL hadoopHdfs = job.getClass().getClassLoader().getResource("hdfs-site.xml");
+ job.addResource(hadoopHdfs);
+
+ LOG.info("job started");
+ long start = System.currentTimeMillis();
+ long end = start;
+ long time = 0;
+
+ this.profiling = profiling;
+ try {
+ Map<String, NodeControllerInfo> ncMap = hcc.getNodeControllerInfos();
+ LOG.info("ncmap:" + ncMap.size() + " " + ncMap.keySet().toString());
+ switch (planChoice) {
+ case BUILD_DEBRUJIN_GRAPH:
+ default:
+ jobGen = new JobGenBrujinGraph(job, scheduler, ncMap, numPartitionPerMachine);
+ break;
+ }
+
+ start = System.currentTimeMillis();
+ runCreate(jobGen);
+ end = System.currentTimeMillis();
+ time = end - start;
+ LOG.info("result writing finished " + time + "ms");
+ LOG.info("job finished");
+ } catch (Exception e) {
+ throw new HyracksException(e);
+ }
+ }
+
+ private void runCreate(JobGen jobGen) throws Exception {
+ try {
+ JobSpecification createJob = jobGen.generateJob();
+ execute(createJob);
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw e;
+ }
+ }
+
+ private void execute(JobSpecification job) throws Exception {
+ job.setUseConnectorPolicyForScheduling(false);
+ JobId jobId = hcc
+ .startJob(job, profiling ? EnumSet.of(JobFlag.PROFILE_RUNTIME) : EnumSet.noneOf(JobFlag.class));
+ hcc.waitForCompletion(jobId);
+ }
+
+ public static void main(String[] args) throws Exception {
+ GenomixJob jobConf = new GenomixJob();
+ String[] otherArgs = new GenericOptionsParser(jobConf, args).getRemainingArgs();
+ if (otherArgs.length < 4) {
+ System.err.println("Need <serverIP> <port> <input> <output>");
+ System.exit(-1);
+ }
+ String ipAddress = otherArgs[0];
+ int port = Integer.parseInt(otherArgs[1]);
+ int numOfDuplicate = jobConf.getInt(CPARTITION_PER_MACHINE, 2);
+ boolean bProfiling = jobConf.getBoolean(IS_PROFILING, true);
+ // FileInputFormat.setInputPaths(job, otherArgs[2]);
+ {
+ Path path = new Path(jobConf.getWorkingDirectory(), otherArgs[2]);
+ jobConf.set("mapred.input.dir", path.toString());
+
+ Path outputDir = new Path(jobConf.getWorkingDirectory(), otherArgs[3]);
+ jobConf.set("mapred.output.dir", outputDir.toString());
+ }
+ // FileInputFormat.addInputPath(jobConf, new Path(otherArgs[2]));
+ // FileOutputFormat.setOutputPath(job, new Path(otherArgs[3]));
+ Driver driver = new Driver(ipAddress, port, numOfDuplicate);
+ driver.runJob(jobConf, Plan.BUILD_DEBRUJIN_GRAPH, bProfiling);
+ }
+}
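
As a usage sketch, the driver can also be invoked programmatically; the host, port, partition count, and paths below are placeholders:

    import edu.uci.ics.genomix.driver.Driver;
    import edu.uci.ics.genomix.job.GenomixJob;

    public class RunGraphBuild {
        public static void main(String[] args) throws Exception {
            GenomixJob job = new GenomixJob();
            job.setKmerLength(21);                        // forced to an odd value by the job generator
            job.set("mapred.input.dir", "/data/reads");   // placeholder HDFS paths
            job.set("mapred.output.dir", "/data/graph");
            Driver driver = new Driver("127.0.0.1", 1098, 2); // CC host, port, partitions per machine
            driver.runJob(job, Driver.Plan.BUILD_DEBRUJIN_GRAPH, false);
        }
    }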
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
new file mode 100644
index 0000000..be82477
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/GenomixJob.java
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.job;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+
+public class GenomixJob extends JobConf {
+
+ public static final String JOB_NAME = "genomix";
+
+    /** Kmer length */
+    public static final String KMER_LENGTH = "genomix.kmer";
+    /** Frame size */
+    public static final String FRAME_SIZE = "genomix.framesize";
+    /** Frame limit, required by Hyracks */
+    public static final String FRAME_LIMIT = "genomix.framelimit";
+    /** Table size, required by Hyracks */
+    public static final String TABLE_SIZE = "genomix.tablesize";
+    /** Group-by type */
+    public static final String GROUPBY_TYPE = "genomix.graph.groupby.type";
+    /** Graph output format */
+    public static final String OUTPUT_FORMAT = "genomix.graph.output";
+    /** Whether to also generate the reversed kmer sequence */
+    public static final String REVERSED_KMER = "genomix.kmer.reversed";
+
+    /** Configuration used by the hybrid group-by in the graph build phase */
+ public static final String GROUPBY_HYBRID_INPUTSIZE = "genomix.graph.groupby.hybrid.inputsize";
+ public static final String GROUPBY_HYBRID_INPUTKEYS = "genomix.graph.groupby.hybrid.inputkeys";
+ public static final String GROUPBY_HYBRID_RECORDSIZE_SINGLE = "genomix.graph.groupby.hybrid.recordsize.single";
+ public static final String GROUPBY_HYBRID_RECORDSIZE_CROSS = "genomix.graph.groupby.hybrid.recordsize.cross";
+ public static final String GROUPBY_HYBRID_HASHLEVEL = "genomix.graph.groupby.hybrid.hashlevel";
+
+ public static final int DEFAULT_KMER = 21;
+ public static final int DEFAULT_FRAME_SIZE = 32768;
+ public static final int DEFAULT_FRAME_LIMIT = 4096;
+ public static final int DEFAULT_TABLE_SIZE = 10485767;
+ public static final long DEFAULT_GROUPBY_HYBRID_INPUTSIZE = 154000000L;
+ public static final long DEFAULT_GROUPBY_HYBRID_INPUTKEYS = 38500000L;
+ public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_SINGLE = 9;
+ public static final int DEFAULT_GROUPBY_HYBRID_HASHLEVEL = 1;
+ public static final int DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS = 13;
+
+ public static final boolean DEFAULT_REVERSED = false;
+
+ public static final String DEFAULT_GROUPBY_TYPE = "hybrid";
+ public static final String DEFAULT_OUTPUT_FORMAT = "binary";
+
+ public GenomixJob() throws IOException {
+ super(new Configuration());
+ }
+
+ public GenomixJob(Configuration conf) throws IOException {
+ super(conf);
+ }
+
+    /**
+     * Set the kmer length
+     *
+     * @param kmerlength
+     *            the desired kmer length
+     */
+ final public void setKmerLength(int kmerlength) {
+ setInt(KMER_LENGTH, kmerlength);
+ }
+
+ final public void setFrameSize(int frameSize) {
+ setInt(FRAME_SIZE, frameSize);
+ }
+
+ final public void setFrameLimit(int frameLimit) {
+ setInt(FRAME_LIMIT, frameLimit);
+ }
+
+ final public void setTableSize(int tableSize) {
+ setInt(TABLE_SIZE, tableSize);
+ }
+
+}
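
Because GenomixJob extends JobConf and Driver.main routes its arguments through GenericOptionsParser, any of the keys above can be overridden with -D options on the command line. A hypothetical invocation (class path and paths are placeholders):

    java -cp genomix-hyracks.jar edu.uci.ics.genomix.driver.Driver \
        -Dgenomix.kmer=25 -Dgenomix.graph.groupby.type=precluster -Dgenomix.graph.output=text \
        127.0.0.1 1098 /data/reads /data/graph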
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGen.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGen.java
new file mode 100644
index 0000000..b1f9a29
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGen.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.job;
+
+import java.util.UUID;
+
+import org.apache.hadoop.conf.Configuration;
+
+import edu.uci.ics.hyracks.api.exceptions.HyracksException;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+
+public abstract class JobGen {
+
+ protected final Configuration conf;
+ protected final GenomixJob genomixJob;
+ protected String jobId = new UUID(System.currentTimeMillis(), System.nanoTime()).toString();
+
+ public JobGen(GenomixJob job) {
+ this.conf = job;
+ this.genomixJob = job;
+ this.initJobConfiguration();
+ }
+
+ protected abstract void initJobConfiguration();
+
+ public abstract JobSpecification generateJob() throws HyracksException;
+
+}
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
new file mode 100644
index 0000000..79b38e8
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenBrujinGraph.java
@@ -0,0 +1,295 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.job;
+
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+
+import edu.uci.ics.genomix.data.std.accessors.ByteSerializerDeserializer;
+import edu.uci.ics.genomix.data.std.accessors.KmerBinaryHashFunctionFamily;
+import edu.uci.ics.genomix.data.std.accessors.KmerHashPartitioncomputerFactory;
+import edu.uci.ics.genomix.data.std.accessors.KmerNormarlizedComputerFactory;
+import edu.uci.ics.genomix.data.std.primitive.KmerPointable;
+import edu.uci.ics.genomix.dataflow.ConnectorPolicyAssignmentPolicy;
+import edu.uci.ics.genomix.dataflow.KMerSequenceWriterFactory;
+import edu.uci.ics.genomix.dataflow.KMerTextWriterFactory;
+import edu.uci.ics.genomix.dataflow.ReadsKeyValueParserFactory;
+import edu.uci.ics.genomix.dataflow.aggregators.DistributedMergeLmerAggregateFactory;
+import edu.uci.ics.genomix.dataflow.aggregators.MergeKmerAggregateFactory;
+import edu.uci.ics.hyracks.api.client.NodeControllerInfo;
+import edu.uci.ics.hyracks.api.constraints.PartitionConstraintHelper;
+import edu.uci.ics.hyracks.api.dataflow.IConnectorDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFamily;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.exceptions.HyracksException;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.data.std.accessors.PointableBinaryComparatorFactory;
+import edu.uci.ics.hyracks.data.std.accessors.PointableBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.partition.FieldHashPartitionComputerFactory;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.connectors.MToNPartitioningConnectorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.connectors.MToNPartitioningMergingConnectorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.HashSpillableTableFactory;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+import edu.uci.ics.hyracks.dataflow.std.group.external.ExternalGroupOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.hybridhash.HybridHashGroupOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.preclustered.PreclusteredGroupOperatorDescriptor;
+import edu.uci.ics.hyracks.hdfs.api.ITupleWriterFactory;
+import edu.uci.ics.hyracks.hdfs.dataflow.HDFSReadOperatorDescriptor;
+import edu.uci.ics.hyracks.hdfs.dataflow.HDFSWriteOperatorDescriptor;
+import edu.uci.ics.hyracks.hdfs.scheduler.Scheduler;
+
+public class JobGenBrujinGraph extends JobGen {
+ public enum GroupbyType {
+ EXTERNAL,
+ PRECLUSTER,
+ HYBRIDHASH,
+ }
+
+ public enum OutputFormat {
+ TEXT,
+ BINARY,
+ }
+
+ JobConf job;
+ private static final Log LOG = LogFactory.getLog(JobGenBrujinGraph.class);
+ private Scheduler scheduler;
+ private String[] ncNodeNames;
+
+ private int kmers;
+ private int frameLimits;
+ private int frameSize;
+ private int tableSize;
+ private GroupbyType groupbyType;
+ private OutputFormat outputFormat;
+ private boolean bGenerateReversedKmer;
+
+ private AbstractOperatorDescriptor singleGrouper;
+ private IConnectorDescriptor connPartition;
+ private AbstractOperatorDescriptor crossGrouper;
+ private RecordDescriptor readOutputRec;
+ private RecordDescriptor combineOutputRec;
+
+ /** works for hybrid hashing */
+ private long inputSizeInRawRecords;
+ private long inputSizeInUniqueKeys;
+ private int recordSizeInBytes;
+ private int hashfuncStartLevel;
+
+ private void logDebug(String status) {
+ String names = "";
+ for (String str : ncNodeNames) {
+ names += str + " ";
+ }
+ LOG.info(status + " nc nodes:" + ncNodeNames.length + " " + names);
+ }
+
+ public JobGenBrujinGraph(GenomixJob job, Scheduler scheduler, final Map<String, NodeControllerInfo> ncMap,
+ int numPartitionPerMachine) {
+ super(job);
+ this.scheduler = scheduler;
+ String[] nodes = new String[ncMap.size()];
+ ncMap.keySet().toArray(nodes);
+ ncNodeNames = new String[nodes.length * numPartitionPerMachine];
+ for (int i = 0; i < numPartitionPerMachine; i++) {
+ System.arraycopy(nodes, 0, ncNodeNames, i * nodes.length, nodes.length);
+ }
+ logDebug("initialize");
+ }
+
+ private ExternalGroupOperatorDescriptor newExternalGroupby(JobSpecification jobSpec, int[] keyFields,
+ IAggregatorDescriptorFactory aggeragater) {
+ return new ExternalGroupOperatorDescriptor(jobSpec, keyFields, frameLimits,
+ new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory.of(KmerPointable.FACTORY) },
+ new KmerNormarlizedComputerFactory(), aggeragater, new DistributedMergeLmerAggregateFactory(),
+ combineOutputRec, new HashSpillableTableFactory(
+ new FieldHashPartitionComputerFactory(keyFields,
+ new IBinaryHashFunctionFactory[] { PointableBinaryHashFunctionFactory
+ .of(KmerPointable.FACTORY) }), tableSize), true);
+ }
+
+ private HybridHashGroupOperatorDescriptor newHybridGroupby(JobSpecification jobSpec, int[] keyFields,
+ long inputSizeInRawRecords, long inputSizeInUniqueKeys, int recordSizeInBytes, int hashfuncStartLevel,
+ IAggregatorDescriptorFactory aggeragater) throws HyracksDataException {
+ return new HybridHashGroupOperatorDescriptor(jobSpec, keyFields, frameLimits, inputSizeInRawRecords,
+ inputSizeInUniqueKeys, recordSizeInBytes, tableSize,
+ new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory.of(KmerPointable.FACTORY) },
+ new IBinaryHashFunctionFamily[] { new KmerBinaryHashFunctionFamily() }, hashfuncStartLevel,
+ new KmerNormarlizedComputerFactory(), aggeragater, new DistributedMergeLmerAggregateFactory(),
+ combineOutputRec, true);
+ }
+
+ private void generateDescriptorbyType(JobSpecification jobSpec) throws HyracksDataException {
+        int[] keyFields = new int[] { 0 }; // position of the kmer key field in the record
+
+ switch (groupbyType) {
+ case EXTERNAL:
+ singleGrouper = newExternalGroupby(jobSpec, keyFields, new MergeKmerAggregateFactory());
+ connPartition = new MToNPartitioningConnectorDescriptor(jobSpec, new KmerHashPartitioncomputerFactory());
+ crossGrouper = newExternalGroupby(jobSpec, keyFields, new DistributedMergeLmerAggregateFactory());
+ break;
+ case PRECLUSTER:
+ singleGrouper = newExternalGroupby(jobSpec, keyFields, new MergeKmerAggregateFactory());
+ connPartition = new MToNPartitioningMergingConnectorDescriptor(jobSpec,
+ new KmerHashPartitioncomputerFactory(), keyFields,
+ new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory.of(KmerPointable.FACTORY) });
+ crossGrouper = new PreclusteredGroupOperatorDescriptor(jobSpec, keyFields,
+ new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory.of(KmerPointable.FACTORY) },
+ new DistributedMergeLmerAggregateFactory(), combineOutputRec);
+ break;
+ case HYBRIDHASH:
+ default:
+ singleGrouper = newHybridGroupby(jobSpec, keyFields, inputSizeInRawRecords, inputSizeInUniqueKeys,
+ recordSizeInBytes, hashfuncStartLevel, new MergeKmerAggregateFactory());
+ connPartition = new MToNPartitioningConnectorDescriptor(jobSpec, new KmerHashPartitioncomputerFactory());
+
+ crossGrouper = newHybridGroupby(jobSpec, keyFields, inputSizeInRawRecords, inputSizeInUniqueKeys,
+ recordSizeInBytes, hashfuncStartLevel, new DistributedMergeLmerAggregateFactory());
+ break;
+ }
+ }
+
+ public HDFSReadOperatorDescriptor createHDFSReader(JobSpecification jobSpec) throws HyracksDataException {
+ try {
+
+ InputSplit[] splits = job.getInputFormat().getSplits(job, ncNodeNames.length);
+
+ LOG.info("HDFS read into " + splits.length + " splits");
+ String[] readSchedule = scheduler.getLocationConstraints(splits);
+ String log = "";
+ for (String schedule : readSchedule) {
+ log += schedule + " ";
+ }
+ LOG.info("HDFS read schedule " + log);
+ return new HDFSReadOperatorDescriptor(jobSpec, readOutputRec, job, splits, readSchedule,
+ new ReadsKeyValueParserFactory(kmers, bGenerateReversedKmer));
+ } catch (Exception e) {
+ throw new HyracksDataException(e);
+ }
+ }
+
+ @Override
+ public JobSpecification generateJob() throws HyracksException {
+
+ JobSpecification jobSpec = new JobSpecification();
+ readOutputRec = new RecordDescriptor(
+ new ISerializerDeserializer[] { null, ByteSerializerDeserializer.INSTANCE });
+ combineOutputRec = new RecordDescriptor(new ISerializerDeserializer[] { null,
+ ByteSerializerDeserializer.INSTANCE, ByteSerializerDeserializer.INSTANCE });
+ jobSpec.setFrameSize(frameSize);
+
+ // File input
+ HDFSReadOperatorDescriptor readOperator = createHDFSReader(jobSpec);
+
+ logDebug("Read Operator");
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, readOperator, ncNodeNames);
+
+ generateDescriptorbyType(jobSpec);
+ logDebug("SingleGroupby Operator");
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, singleGrouper, ncNodeNames);
+
+ IConnectorDescriptor readfileConn = new OneToOneConnectorDescriptor(jobSpec);
+ jobSpec.connect(readfileConn, readOperator, 0, singleGrouper, 0);
+
+ logDebug("CrossGrouper Operator");
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, crossGrouper, ncNodeNames);
+ jobSpec.connect(connPartition, singleGrouper, 0, crossGrouper, 0);
+
+ // Output
+ ITupleWriterFactory writer = null;
+ switch (outputFormat) {
+ case TEXT:
+ writer = new KMerTextWriterFactory(kmers);
+ break;
+ case BINARY:
+ default:
+ writer = new KMerSequenceWriterFactory(job);
+ break;
+ }
+ HDFSWriteOperatorDescriptor writeOperator = new HDFSWriteOperatorDescriptor(jobSpec, job, writer);
+
+ logDebug("WriteOperator");
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, writeOperator, ncNodeNames);
+
+ IConnectorDescriptor printConn = new OneToOneConnectorDescriptor(jobSpec);
+ jobSpec.connect(printConn, crossGrouper, 0, writeOperator, 0);
+ jobSpec.addRoot(writeOperator);
+
+ if (groupbyType == GroupbyType.PRECLUSTER) {
+ jobSpec.setConnectorPolicyAssignmentPolicy(new ConnectorPolicyAssignmentPolicy());
+ }
+ return jobSpec;
+ }
+
+ @Override
+ protected void initJobConfiguration() {
+
+ kmers = conf.getInt(GenomixJob.KMER_LENGTH, GenomixJob.DEFAULT_KMER);
+        if (kmers % 2 == 0) { // force k to be odd: with an even k a kmer can equal its own reverse complement
+ kmers--;
+ conf.setInt(GenomixJob.KMER_LENGTH, kmers);
+ }
+ frameLimits = conf.getInt(GenomixJob.FRAME_LIMIT, GenomixJob.DEFAULT_FRAME_LIMIT);
+ tableSize = conf.getInt(GenomixJob.TABLE_SIZE, GenomixJob.DEFAULT_TABLE_SIZE);
+ frameSize = conf.getInt(GenomixJob.FRAME_SIZE, GenomixJob.DEFAULT_FRAME_SIZE);
+ inputSizeInRawRecords = conf.getLong(GenomixJob.GROUPBY_HYBRID_INPUTSIZE,
+ GenomixJob.DEFAULT_GROUPBY_HYBRID_INPUTSIZE);
+ inputSizeInUniqueKeys = conf.getLong(GenomixJob.GROUPBY_HYBRID_INPUTKEYS,
+ GenomixJob.DEFAULT_GROUPBY_HYBRID_INPUTKEYS);
+ recordSizeInBytes = conf.getInt(GenomixJob.GROUPBY_HYBRID_RECORDSIZE_SINGLE,
+ GenomixJob.DEFAULT_GROUPBY_HYBRID_RECORDSIZE_SINGLE);
+ hashfuncStartLevel = conf.getInt(GenomixJob.GROUPBY_HYBRID_HASHLEVEL,
+ GenomixJob.DEFAULT_GROUPBY_HYBRID_HASHLEVEL);
+        /** note: this overwrites the single-grouper record size with the cross value; unclear why both are read */
+ recordSizeInBytes = conf.getInt(GenomixJob.GROUPBY_HYBRID_RECORDSIZE_CROSS,
+ GenomixJob.DEFAULT_GROUPBY_HYBRID_RECORDSIZE_CROSS);
+
+ bGenerateReversedKmer = conf.getBoolean(GenomixJob.REVERSED_KMER, GenomixJob.DEFAULT_REVERSED);
+
+ String type = conf.get(GenomixJob.GROUPBY_TYPE, GenomixJob.DEFAULT_GROUPBY_TYPE);
+ if (type.equalsIgnoreCase("external")) {
+ groupbyType = GroupbyType.EXTERNAL;
+ } else if (type.equalsIgnoreCase("precluster")) {
+ groupbyType = GroupbyType.PRECLUSTER;
+ } else {
+ groupbyType = GroupbyType.HYBRIDHASH;
+ }
+
+ String output = conf.get(GenomixJob.OUTPUT_FORMAT, GenomixJob.DEFAULT_OUTPUT_FORMAT);
+ if (output.equalsIgnoreCase("text")) {
+ outputFormat = OutputFormat.TEXT;
+ } else {
+ outputFormat = OutputFormat.BINARY;
+ }
+ job = new JobConf(conf);
+ LOG.info("Genomix Graph Build Configuration");
+ LOG.info("Kmer:" + kmers);
+ LOG.info("Groupby type:" + type);
+ LOG.info("Output format:" + output);
+ LOG.info("Frame limit" + frameLimits);
+ LOG.info("Frame size" + frameSize);
+ }
+
+}
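
For orientation, the job assembled by generateJob above is a straight four-operator pipeline:

    HDFSRead --(1:1)--> singleGrouper --(M:N, kmer-hash partition)--> crossGrouper --(1:1)--> HDFSWrite

singleGrouper pre-aggregates duplicate kmers within each partition (MergeKmerAggregateFactory), and crossGrouper then merges the partial (bitmap, count) pairs across partitions (DistributedMergeLmerAggregateFactory).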
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenStatistic.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenStatistic.java
new file mode 100644
index 0000000..a88fa79
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/job/JobGenStatistic.java
@@ -0,0 +1,169 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.job;
+
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+
+import edu.uci.ics.genomix.data.std.accessors.ByteSerializerDeserializer;
+import edu.uci.ics.genomix.util.ByteComparatorFactory;
+import edu.uci.ics.genomix.util.StatCountAggregateFactory;
+import edu.uci.ics.genomix.util.StatReadsKeyValueParserFactory;
+import edu.uci.ics.genomix.util.StatSumAggregateFactory;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.constraints.PartitionConstraintHelper;
+import edu.uci.ics.hyracks.api.dataflow.IConnectorDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputerFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.exceptions.HyracksException;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.partition.FieldHashPartitionComputerFactory;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.connectors.MToNPartitioningMergingConnectorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.file.AbstractFileWriteOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.file.LineFileWriteOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.HashSpillableTableFactory;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+import edu.uci.ics.hyracks.dataflow.std.group.external.ExternalGroupOperatorDescriptor;
+import edu.uci.ics.hyracks.hdfs.dataflow.HDFSReadOperatorDescriptor;
+import edu.uci.ics.hyracks.hdfs.scheduler.Scheduler;
+
+public class JobGenStatistic extends JobGen {
+ private int kmers;
+ private JobConf hadoopjob;
+ private RecordDescriptor readOutputRec;
+ private String[] ncNodeNames;
+ private Scheduler scheduler;
+ private RecordDescriptor combineOutputRec;
+
+ public JobGenStatistic(GenomixJob job) {
+ super(job);
+ }
+
+ @Override
+ protected void initJobConfiguration() {
+ kmers = conf.getInt(GenomixJob.KMER_LENGTH, GenomixJob.DEFAULT_KMER);
+ hadoopjob = new JobConf(conf);
+ hadoopjob.setInputFormat(SequenceFileInputFormat.class);
+ }
+
+ @Override
+ public JobSpecification generateJob() throws HyracksException {
+ int[] degreeFields = { 0, 1 }; // indegree, outdegree
+ int[] countFields = { 2 };
+ JobSpecification jobSpec = new JobSpecification();
+ /** specify the record fields after read */
+ readOutputRec = new RecordDescriptor(new ISerializerDeserializer[] { ByteSerializerDeserializer.INSTANCE,
+ ByteSerializerDeserializer.INSTANCE, ByteSerializerDeserializer.INSTANCE });
+ combineOutputRec = new RecordDescriptor(new ISerializerDeserializer[] { ByteSerializerDeserializer.INSTANCE,
+ ByteSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE });
+ /** the reader */
+ HDFSReadOperatorDescriptor readOperator = createHDFSReader(jobSpec);
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, readOperator, ncNodeNames);
+
+ /** the combiner aggregator */
+ AbstractOperatorDescriptor degreeLocal = connectLocalAggregateByField(jobSpec, degreeFields, readOperator);
+ AbstractOperatorDescriptor countLocal = connectLocalAggregateByField(jobSpec, countFields, readOperator);
+
+ /** the final aggregator */
+ AbstractOperatorDescriptor degreeMerger = connectFinalAggregateByField(jobSpec, degreeFields, degreeLocal);
+ AbstractOperatorDescriptor countMerger = connectFinalAggregateByField(jobSpec, countFields, countLocal);
+
+ /** writer */
+ AbstractFileWriteOperatorDescriptor writeDegree = connectWriter(jobSpec, degreeFields, degreeMerger);
+ AbstractFileWriteOperatorDescriptor writeCount = connectWriter(jobSpec, countFields, countMerger);
+ jobSpec.addRoot(writeDegree);
+ jobSpec.addRoot(writeCount);
+        return jobSpec;
+ }
+
+ private HDFSReadOperatorDescriptor createHDFSReader(JobSpecification jobSpec) throws HyracksDataException {
+ try {
+
+ InputSplit[] splits = hadoopjob.getInputFormat().getSplits(hadoopjob, ncNodeNames.length);
+
+ String[] readSchedule = scheduler.getLocationConstraints(splits);
+ return new HDFSReadOperatorDescriptor(jobSpec, readOutputRec, hadoopjob, splits, readSchedule,
+ new StatReadsKeyValueParserFactory());
+ } catch (Exception e) {
+ throw new HyracksDataException(e);
+ }
+ }
+
+ private ExternalGroupOperatorDescriptor newExternalGroupby(JobSpecification jobSpec, int[] keyFields,
+ IAggregatorDescriptorFactory aggeragater) {
+ return new ExternalGroupOperatorDescriptor(jobSpec, keyFields, GenomixJob.DEFAULT_FRAME_LIMIT,
+ new IBinaryComparatorFactory[] { new ByteComparatorFactory(), new ByteComparatorFactory() }, null,
+ aggeragater, new StatSumAggregateFactory(), combineOutputRec, new HashSpillableTableFactory(
+ new FieldHashPartitionComputerFactory(keyFields, new IBinaryHashFunctionFactory[] {
+ new ByteComparatorFactory(), new ByteComparatorFactory() }),
+ GenomixJob.DEFAULT_TABLE_SIZE), true);
+ }
+
+ private AbstractOperatorDescriptor connectLocalAggregateByField(JobSpecification jobSpec, int[] fields,
+ HDFSReadOperatorDescriptor readOperator) {
+ AbstractOperatorDescriptor localAggregator = newExternalGroupby(jobSpec, fields,
+ new StatCountAggregateFactory());
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, localAggregator, ncNodeNames);
+ IConnectorDescriptor readfileConn = new OneToOneConnectorDescriptor(jobSpec);
+ jobSpec.connect(readfileConn, readOperator, 0, localAggregator, 0);
+ return localAggregator;
+ }
+
+ private AbstractOperatorDescriptor connectFinalAggregateByField(JobSpecification jobSpec, int[] fields,
+ AbstractOperatorDescriptor localAggregator) {
+ AbstractOperatorDescriptor finalAggregator = newExternalGroupby(jobSpec, fields, new StatSumAggregateFactory());
+ // only need one reducer
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, finalAggregator, ncNodeNames[fields[0]
+ % ncNodeNames.length]);
+ IConnectorDescriptor mergeConn = new MToNPartitioningMergingConnectorDescriptor(jobSpec,
+ new ITuplePartitionComputerFactory() {
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public ITuplePartitionComputer createPartitioner() {
+ return new ITuplePartitionComputer() {
+ @Override
+ public int partition(IFrameTupleAccessor accessor, int tIndex, int nParts)
+ throws HyracksDataException {
+ return 0;
+ }
+ };
+ }
+ }, fields, new IBinaryComparatorFactory[] { new ByteComparatorFactory() });
+ jobSpec.connect(mergeConn, localAggregator, 0, finalAggregator, 0);
+ return finalAggregator;
+ }
+
+ private AbstractFileWriteOperatorDescriptor connectWriter(JobSpecification jobSpec, int[] fields,
+ AbstractOperatorDescriptor finalAggregator) {
+ LineFileWriteOperatorDescriptor writeOperator = new LineFileWriteOperatorDescriptor(jobSpec, null);
+ PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, writeOperator, ncNodeNames[fields[0]
+ % ncNodeNames.length]);
+
+ IConnectorDescriptor printConn = new OneToOneConnectorDescriptor(jobSpec);
+ jobSpec.connect(printConn, finalAggregator, 0, writeOperator, 0);
+ return writeOperator;
+ }
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/ByteComparatorFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/ByteComparatorFactory.java
new file mode 100644
index 0000000..832966b
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/ByteComparatorFactory.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.util;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFactory;
+
+public class ByteComparatorFactory implements IBinaryComparatorFactory, IBinaryHashFunctionFactory {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public IBinaryComparator createBinaryComparator() {
+ return new IBinaryComparator() {
+
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ return b1[s1] - b2[s2];
+ }
+
+ };
+ }
+
+ @Override
+ public IBinaryHashFunction createBinaryHashFunction() {
+ return new IBinaryHashFunction() {
+
+ @Override
+ public int hash(byte[] bytes, int offset, int length) {
+ return bytes[offset];
+ }
+
+ };
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatCountAggregateFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatCountAggregateFactory.java
new file mode 100644
index 0000000..65303a8
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatCountAggregateFactory.java
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.util;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+
+public class StatCountAggregateFactory implements IAggregatorDescriptorFactory {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ public class CountAggregator implements IAggregatorDescriptor {
+ private final int[] keyFields;
+
+ public CountAggregator(int[] keyFields) {
+ this.keyFields = keyFields;
+ }
+
+ @Override
+ public AggregateState createAggregateStates() {
+            // no state object is needed: the running count lives in the output tuple
+ return null;
+ }
+
+ @Override
+ public void init(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex, AggregateState state)
+ throws HyracksDataException {
+ int count = 1;
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ try {
+ fieldOutput.writeInt(count);
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when initializing the aggregator.");
+ }
+ }
+
+ @Override
+ public void reset() {
+            // Nothing to reset: this aggregator keeps no internal state.
+ }
+
+ @Override
+ public void aggregate(IFrameTupleAccessor accessor, int tIndex, IFrameTupleAccessor stateAccessor,
+ int stateTupleIndex, AggregateState state) throws HyracksDataException {
+ int count = 1;
+
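+            // Locate the count field inside the existing partial-result tuple and update it in place.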
+ int statetupleOffset = stateAccessor.getTupleStartOffset(stateTupleIndex);
+ int countfieldStart = stateAccessor.getFieldStartOffset(stateTupleIndex, keyFields.length);
+ int countoffset = statetupleOffset + stateAccessor.getFieldSlotsLength() + countfieldStart;
+
+ byte[] data = stateAccessor.getBuffer().array();
+ count += IntegerSerializerDeserializer.getInt(data, countoffset);
+ IntegerSerializerDeserializer.putInt(count, data, countoffset);
+ }
+
+ @Override
+ public void outputPartialResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ int count = getCount(accessor, tIndex);
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ try {
+ fieldOutput.writeInt(count);
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when writing aggregation to the output buffer.");
+ }
+
+ }
+
+ protected int getCount(IFrameTupleAccessor accessor, int tIndex) {
+ int tupleOffset = accessor.getTupleStartOffset(tIndex);
+ int fieldStart = accessor.getFieldStartOffset(tIndex, keyFields.length);
+ int countoffset = tupleOffset + accessor.getFieldSlotsLength() + fieldStart;
+ byte[] data = accessor.getBuffer().array();
+
+ return IntegerSerializerDeserializer.getInt(data, countoffset);
+ }
+
+ @Override
+ public void outputFinalResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ outputPartialResult(tupleBuilder, accessor, tIndex, state);
+ }
+
+ @Override
+ public void close() {
+            // Nothing to close.
+ }
+
+ }
+
+ @Override
+ public IAggregatorDescriptor createAggregator(IHyracksTaskContext ctx, RecordDescriptor inRecordDescriptor,
+ RecordDescriptor outRecordDescriptor, int[] keyFields, int[] keyFieldsInPartialResults)
+ throws HyracksDataException {
+ return new CountAggregator(keyFields);
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatReadsKeyValueParserFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatReadsKeyValueParserFactory.java
new file mode 100644
index 0000000..2fcca67
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatReadsKeyValueParserFactory.java
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.util;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.genomix.data.std.accessors.ByteSerializerDeserializer;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+import edu.uci.ics.hyracks.api.comm.IFrameWriter;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
+import edu.uci.ics.hyracks.hdfs.api.IKeyValueParser;
+import edu.uci.ics.hyracks.hdfs.api.IKeyValueParserFactory;
+
+public class StatReadsKeyValueParserFactory implements IKeyValueParserFactory<KmerBytesWritable, KmerCountValue> {
+
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public IKeyValueParser<KmerBytesWritable, KmerCountValue> createKeyValueParser(IHyracksTaskContext ctx)
+ throws HyracksDataException {
+
+ final ArrayTupleBuilder tupleBuilder = new ArrayTupleBuilder(2);
+ final ByteBuffer outputBuffer = ctx.allocateFrame();
+ final FrameTupleAppender outputAppender = new FrameTupleAppender(ctx.getFrameSize());
+ outputAppender.reset(outputBuffer, true);
+
+ return new IKeyValueParser<KmerBytesWritable, KmerCountValue>() {
+
+ @Override
+ public void open(IFrameWriter writer) throws HyracksDataException {
+                // No setup is needed before parsing begins.
+ }
+
+ @Override
+ public void parse(KmerBytesWritable key, KmerCountValue value, IFrameWriter writer)
+ throws HyracksDataException {
+ byte adjMap = value.getAdjBitMap();
+ byte count = value.getCount();
+                insertToFrame((byte) GeneCode.inDegree(adjMap), (byte) GeneCode.outDegree(adjMap), count, writer);
+ }
+
+ @Override
+ public void close(IFrameWriter writer) throws HyracksDataException {
+ FrameUtils.flushFrame(outputBuffer, writer);
+ }
+
+            private void insertToFrame(byte indegree, byte outdegree, byte count, IFrameWriter writer) {
+ try {
+ tupleBuilder.reset();
+ tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE, indegree);
+ tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE, outdegree);
+ tupleBuilder.addField(ByteSerializerDeserializer.INSTANCE, count);
+
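+                    // Append to the current frame; if it is full, flush it and retry once before failing.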
+ if (!outputAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0,
+ tupleBuilder.getSize())) {
+ FrameUtils.flushFrame(outputBuffer, writer);
+ outputAppender.reset(outputBuffer, true);
+ if (!outputAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0,
+ tupleBuilder.getSize())) {
+                            throw new IllegalStateException(
+                                "Failed to copy a record into a frame: the record size is too large.");
+ }
+ }
+ } catch (Exception e) {
+ throw new IllegalStateException(e);
+ }
+ }
+ };
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatSumAggregateFactory.java b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatSumAggregateFactory.java
new file mode 100644
index 0000000..39ac60a
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/java/edu/uci/ics/genomix/util/StatSumAggregateFactory.java
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.util;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+
+public class StatSumAggregateFactory implements IAggregatorDescriptorFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public class DistributeAggregatorDescriptor implements IAggregatorDescriptor {
+
+ private final int[] keyFields;
+
+ public DistributeAggregatorDescriptor(int[] keyFields) {
+ this.keyFields = keyFields;
+ }
+
+ @Override
+ public AggregateState createAggregateStates() {
+            // No separate state object is needed: the running sum is kept in the
+            // partial-result tuple itself, so no AggregateState is allocated.
+            return null;
+ }
+
+ protected int getCount(IFrameTupleAccessor accessor, int tIndex) {
+ int tupleOffset = accessor.getTupleStartOffset(tIndex);
+ int fieldStart = accessor.getFieldStartOffset(tIndex, 1);
+ int countoffset = tupleOffset + accessor.getFieldSlotsLength() + fieldStart;
+ byte[] data = accessor.getBuffer().array();
+ return IntegerSerializerDeserializer.getInt(data, countoffset);
+ }
+
+ @Override
+ public void init(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex, AggregateState state)
+ throws HyracksDataException {
+ int count = getCount(accessor, tIndex);
+
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ try {
+ fieldOutput.writeInt(count);
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when initializing the aggregator.");
+ }
+ }
+
+ @Override
+ public void reset() {
+            // Nothing to reset: this aggregator keeps no internal state.
+ }
+
+ @Override
+ public void aggregate(IFrameTupleAccessor accessor, int tIndex, IFrameTupleAccessor stateAccessor,
+ int stateTupleIndex, AggregateState state) throws HyracksDataException {
+ int count = getCount(accessor, tIndex);
+
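+            // Add this tuple's count to the running total stored in the state tuple, updating it in place.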
+ int statetupleOffset = stateAccessor.getTupleStartOffset(stateTupleIndex);
+ int countfieldStart = stateAccessor.getFieldStartOffset(stateTupleIndex, 1);
+ int countoffset = statetupleOffset + stateAccessor.getFieldSlotsLength() + countfieldStart;
+
+ byte[] data = stateAccessor.getBuffer().array();
+ count += IntegerSerializerDeserializer.getInt(data, countoffset);
+ IntegerSerializerDeserializer.putInt(count, data, countoffset);
+ }
+
+ @Override
+ public void outputPartialResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ int count = getCount(accessor, tIndex);
+ DataOutput fieldOutput = tupleBuilder.getDataOutput();
+ try {
+ fieldOutput.writeInt(count);
+ tupleBuilder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException("I/O exception when writing aggregation to the output buffer.");
+ }
+
+ }
+
+ @Override
+ public void outputFinalResult(ArrayTupleBuilder tupleBuilder, IFrameTupleAccessor accessor, int tIndex,
+ AggregateState state) throws HyracksDataException {
+ outputPartialResult(tupleBuilder, accessor, tIndex, state);
+
+ }
+
+ @Override
+ public void close() {
+            // Nothing to close.
+ }
+
+ }
+
+ @Override
+ public IAggregatorDescriptor createAggregator(IHyracksTaskContext ctx, RecordDescriptor inRecordDescriptor,
+ RecordDescriptor outRecordDescriptor, int[] keyFields, int[] keyFieldsInPartialResults)
+ throws HyracksDataException {
+ return new DistributeAggregatorDescriptor(keyFields);
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/main/resources/conf/cluster.properties b/genomix/genomix-hyracks/src/main/resources/conf/cluster.properties
new file mode 100644
index 0000000..66251be
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/conf/cluster.properties
@@ -0,0 +1,41 @@
+#The CC port for Hyracks clients
+CC_CLIENTPORT=3099
+
+#The CC port for Hyracks cluster management
+CC_CLUSTERPORT=1099
+
+#The directory of hyracks binaries
+HYRACKS_HOME="../../../../hyracks"
+
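+#The root path under which the temporary directories below are created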
+WORKPATH=""
+#The tmp directory for cc to install jars
+CCTMP_DIR=${WORKPATH}/tmp/t1
+
+#The tmp directory for nc to install jars
+NCTMP_DIR=${WORKPATH}/tmp/t2
+
+#The directory to put cc logs
+CCLOGS_DIR=$CCTMP_DIR/logs
+
+#The directory to put nc logs
+NCLOGS_DIR=$NCTMP_DIR/logs
+
+#Comma separated I/O directories for the spilling of external sort
+IO_DIRS="${WORKPATH}/tmp/t3"
+
+#The JAVA_HOME
+JAVA_HOME=$JAVA_HOME
+
+#Prepend HADOOP_HOME to the CLASSPATH
+CLASSPATH="${HADOOP_HOME}:${CLASSPATH}:."
+
+#The frame size of the internal dataflow engine
+FRAME_SIZE=65536
+
+#CC JAVA_OPTS
+CCJAVA_OPTS="-Xrunjdwp:transport=dt_socket,address=7001,server=y,suspend=n -Xmx1g -Djava.util.logging.config.file=logging.properties"
+# Yourkit option: -agentpath:/grid/0/dev/vborkar/tools/yjp-10.0.4/bin/linux-x86-64/libyjpagent.so=port=20001"
+
+#NC JAVA_OPTS
+NCJAVA_OPTS="-Xrunjdwp:transport=dt_socket,address=7002,server=y,suspend=n -Xmx10g -Djava.util.logging.config.file=logging.properties"
+
diff --git a/genomix/genomix-hyracks/src/main/resources/conf/debugnc.properties b/genomix/genomix-hyracks/src/main/resources/conf/debugnc.properties
new file mode 100644
index 0000000..27afa26
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/conf/debugnc.properties
@@ -0,0 +1,12 @@
+#The tmp directory for nc to install jars
+NCTMP_DIR2=/tmp/t-1
+
+#The directory to put nc logs
+NCLOGS_DIR2=$NCTMP_DIR/logs
+
+#Comma separated I/O directories for the spilling of external sort
+IO_DIRS2="/tmp/t-2,/tmp/t-3"
+
+#NC JAVA_OPTS
+NCJAVA_OPTS2="-Xdebug -Xrunjdwp:transport=dt_socket,address=7003,server=y,suspend=n -Xmx1g -Djava.util.logging.config.file=logging.properties"
+
diff --git a/genomix/genomix-hyracks/src/main/resources/conf/master b/genomix/genomix-hyracks/src/main/resources/conf/master
new file mode 100644
index 0000000..2fbb50c
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/conf/master
@@ -0,0 +1 @@
+localhost
diff --git a/genomix/genomix-hyracks/src/main/resources/conf/slaves b/genomix/genomix-hyracks/src/main/resources/conf/slaves
new file mode 100644
index 0000000..2fbb50c
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/conf/slaves
@@ -0,0 +1 @@
+localhost
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/genomix b/genomix/genomix-hyracks/src/main/resources/scripts/genomix
new file mode 100644
index 0000000..239a46c
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/genomix
@@ -0,0 +1,113 @@
+#!/bin/sh
+# ----------------------------------------------------------------------------
+# Copyright 2001-2006 The Apache Software Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+#
+# Copyright (c) 2001-2006 The Apache Software Foundation. All rights
+# reserved.
+
+
+# resolve links - $0 may be a softlink
+PRG="$0"
+
+while [ -h "$PRG" ]; do
+ ls=`ls -ld "$PRG"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '/.*' > /dev/null; then
+ PRG="$link"
+ else
+ PRG=`dirname "$PRG"`/"$link"
+ fi
+done
+
+PRGDIR=`dirname "$PRG"`
+BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd`
+
+
+
+# OS specific support. $var _must_ be set to either true or false.
+cygwin=false;
+darwin=false;
+case "`uname`" in
+ CYGWIN*) cygwin=true ;;
+ Darwin*) darwin=true
+ if [ -z "$JAVA_VERSION" ] ; then
+ JAVA_VERSION="CurrentJDK"
+ else
+ echo "Using Java version: $JAVA_VERSION"
+ fi
+ if [ -z "$JAVA_HOME" ] ; then
+ JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home
+ fi
+ ;;
+esac
+
+if [ -z "$JAVA_HOME" ] ; then
+ if [ -r /etc/gentoo-release ] ; then
+ JAVA_HOME=`java-config --jre-home`
+ fi
+fi
+
+# For Cygwin, ensure paths are in UNIX format before anything is touched
+if $cygwin ; then
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
+ [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
+fi
+
+# If a specific java binary isn't specified search for the standard 'java' binary
+if [ -z "$JAVACMD" ] ; then
+ if [ -n "$JAVA_HOME" ] ; then
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+ # IBM's JDK on AIX uses strange locations for the executables
+ JAVACMD="$JAVA_HOME/jre/sh/java"
+ else
+ JAVACMD="$JAVA_HOME/bin/java"
+ fi
+ else
+ JAVACMD=`which java`
+ fi
+fi
+
+if [ ! -x "$JAVACMD" ] ; then
+ echo "Error: JAVA_HOME is not defined correctly." 1>&2
+ echo " We cannot execute $JAVACMD" 1>&2
+ exit 1
+fi
+
+if [ -z "$REPO" ]
+then
+ REPO="$BASEDIR"/lib
+fi
+
+CLASSPATH=$CLASSPATH_PREFIX:"$BASEDIR"/etc:"$REPO"/hyracks-dataflow-std-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-api-0.2.3-SNAPSHOT.jar:"$REPO"/json-20090211.jar:"$REPO"/httpclient-4.1-alpha2.jar:"$REPO"/httpcore-4.1-beta1.jar:"$REPO"/commons-logging-1.1.1.jar:"$REPO"/args4j-2.0.12.jar:"$REPO"/commons-lang3-3.1.jar:"$REPO"/hyracks-dataflow-common-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-data-std-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-control-cc-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-control-common-0.2.3-SNAPSHOT.jar:"$REPO"/jetty-server-8.0.0.RC0.jar:"$REPO"/servlet-api-3.0.20100224.jar:"$REPO"/jetty-continuation-8.0.0.RC0.jar:"$REPO"/jetty-http-8.0.0.RC0.jar:"$REPO"/jetty-io-8.0.0.RC0.jar:"$REPO"/jetty-webapp-8.0.0.RC0.jar:"$REPO"/jetty-xml-8.0.0.RC0.jar:"$REPO"/jetty-util-8.0.0.RC0.jar:"$REPO"/jetty-servlet-8.0.0.RC0.jar:"$REPO"/jetty-security-8.0.0.RC0.jar:"$REPO"/wicket-core-1.5.2.jar:"$REPO"/wicket-util-1.5.2.jar:"$REPO"/wicket-request-1.5.2.jar:"$REPO"/slf4j-api-1.6.1.jar:"$REPO"/slf4j-jcl-1.6.3.jar:"$REPO"/hyracks-control-nc-0.2.3-SNAPSHOT.jar:"$REPO"/dcache-client-0.0.1.jar:"$REPO"/jetty-client-8.0.0.M0.jar:"$REPO"/hyracks-net-0.2.3-SNAPSHOT.jar:"$REPO"/commons-io-1.3.1.jar:"$REPO"/hyracks-ipc-0.2.3-SNAPSHOT.jar:"$REPO"/hadoop-core-0.20.2.jar:"$REPO"/commons-cli-1.2.jar:"$REPO"/xmlenc-0.52.jar:"$REPO"/commons-httpclient-3.0.1.jar:"$REPO"/commons-codec-1.3.jar:"$REPO"/commons-net-1.4.1.jar:"$REPO"/jetty-6.1.14.jar:"$REPO"/jetty-util-6.1.14.jar:"$REPO"/jasper-runtime-5.5.12.jar:"$REPO"/jasper-compiler-5.5.12.jar:"$REPO"/jsp-api-2.1-6.1.14.jar:"$REPO"/jsp-2.1-6.1.14.jar:"$REPO"/ant-1.6.5.jar:"$REPO"/commons-el-1.0.jar:"$REPO"/jets3t-0.7.1.jar:"$REPO"/servlet-api-2.5-6.1.14.jar:"$REPO"/kfs-0.3.jar:"$REPO"/hsqldb-1.8.0.10.jar:"$REPO"/oro-2.0.8.jar:"$REPO"/core-3.1.1.jar:"$REPO"/hadoop-test-0.20.2.jar:"$REPO"/ftplet-api-1.0.0.jar:"$REPO"/mina-core-2.0.0-M5.jar:"$REPO"/ftpserver-core-1.0.0.jar:"$REPO"/ftpserver-deprecated-1.0.0-M2.jar:"$REPO"/hyracks-hdfs-core-0.2.3-SNAPSHOT.jar:"$REPO"/hyracks-hdfs-0.20.2-0.2.3-SNAPSHOT.jar:"$REPO"/genomix-data-0.2.3-SNAPSHOT.jar:"$REPO"/genomix-hyracks-0.2.3-SNAPSHOT.jar
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin; then
+ [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
+ [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"`
+ [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"`
+ [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"`
+fi
+
+exec "$JAVACMD" $JAVA_OPTS \
+ -classpath "$CLASSPATH" \
+ -Dapp.name="genomix" \
+ -Dapp.pid="$$" \
+ -Dapp.repo="$REPO" \
+ -Dapp.home="$BASEDIR" \
+ -Dbasedir="$BASEDIR" \
+ edu.uci.ics.genomix.driver.Driver \
+ "$@"
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/genomix.bat b/genomix/genomix-hyracks/src/main/resources/scripts/genomix.bat
new file mode 100644
index 0000000..abcafaf
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/genomix.bat
@@ -0,0 +1,108 @@
+@REM ----------------------------------------------------------------------------
+@REM Copyright 2001-2006 The Apache Software Foundation.
+@REM
+@REM Licensed under the Apache License, Version 2.0 (the "License");
+@REM you may not use this file except in compliance with the License.
+@REM You may obtain a copy of the License at
+@REM
+@REM http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM Unless required by applicable law or agreed to in writing, software
+@REM distributed under the License is distributed on an "AS IS" BASIS,
+@REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM See the License for the specific language governing permissions and
+@REM limitations under the License.
+@REM ----------------------------------------------------------------------------
+@REM
+@REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights
+@REM reserved.
+
+@echo off
+
+set ERROR_CODE=0
+
+:init
+@REM Decide how to startup depending on the version of windows
+
+@REM -- Win98ME
+if NOT "%OS%"=="Windows_NT" goto Win9xArg
+
+@REM set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" @setlocal
+
+@REM -- 4NT shell
+if "%eval[2+2]" == "4" goto 4NTArgs
+
+@REM -- Regular WinNT shell
+set CMD_LINE_ARGS=%*
+goto WinNTGetScriptDir
+
+@REM The 4NT Shell from jp software
+:4NTArgs
+set CMD_LINE_ARGS=%$
+goto WinNTGetScriptDir
+
+:Win9xArg
+@REM Slurp the command line arguments. This loop allows for an unlimited number
+@REM of arguments (up to the command line limit, anyway).
+set CMD_LINE_ARGS=
+:Win9xApp
+if %1a==a goto Win9xGetScriptDir
+set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1
+shift
+goto Win9xApp
+
+:Win9xGetScriptDir
+set SAVEDIR=%CD%
+%0\
+cd %0\..\..
+set BASEDIR=%CD%
+cd %SAVEDIR%
+set SAVE_DIR=
+goto repoSetup
+
+:WinNTGetScriptDir
+set BASEDIR=%~dp0\..
+
+:repoSetup
+
+
+if "%JAVACMD%"=="" set JAVACMD=java
+
+if "%REPO%"=="" set REPO=%BASEDIR%\lib
+
+set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\hyracks-dataflow-std-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-api-0.2.3-SNAPSHOT.jar;"%REPO%"\json-20090211.jar;"%REPO%"\httpclient-4.1-alpha2.jar;"%REPO%"\httpcore-4.1-beta1.jar;"%REPO%"\commons-logging-1.1.1.jar;"%REPO%"\args4j-2.0.12.jar;"%REPO%"\commons-lang3-3.1.jar;"%REPO%"\hyracks-dataflow-common-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-data-std-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-control-cc-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-control-common-0.2.3-SNAPSHOT.jar;"%REPO%"\jetty-server-8.0.0.RC0.jar;"%REPO%"\servlet-api-3.0.20100224.jar;"%REPO%"\jetty-continuation-8.0.0.RC0.jar;"%REPO%"\jetty-http-8.0.0.RC0.jar;"%REPO%"\jetty-io-8.0.0.RC0.jar;"%REPO%"\jetty-webapp-8.0.0.RC0.jar;"%REPO%"\jetty-xml-8.0.0.RC0.jar;"%REPO%"\jetty-util-8.0.0.RC0.jar;"%REPO%"\jetty-servlet-8.0.0.RC0.jar;"%REPO%"\jetty-security-8.0.0.RC0.jar;"%REPO%"\wicket-core-1.5.2.jar;"%REPO%"\wicket-util-1.5.2.jar;"%REPO%"\wicket-request-1.5.2.jar;"%REPO%"\slf4j-api-1.6.1.jar;"%REPO%"\slf4j-jcl-1.6.3.jar;"%REPO%"\hyracks-control-nc-0.2.3-SNAPSHOT.jar;"%REPO%"\dcache-client-0.0.1.jar;"%REPO%"\jetty-client-8.0.0.M0.jar;"%REPO%"\hyracks-net-0.2.3-SNAPSHOT.jar;"%REPO%"\commons-io-1.3.1.jar;"%REPO%"\hyracks-ipc-0.2.3-SNAPSHOT.jar;"%REPO%"\hadoop-core-0.20.2.jar;"%REPO%"\commons-cli-1.2.jar;"%REPO%"\xmlenc-0.52.jar;"%REPO%"\commons-httpclient-3.0.1.jar;"%REPO%"\commons-codec-1.3.jar;"%REPO%"\commons-net-1.4.1.jar;"%REPO%"\jetty-6.1.14.jar;"%REPO%"\jetty-util-6.1.14.jar;"%REPO%"\jasper-runtime-5.5.12.jar;"%REPO%"\jasper-compiler-5.5.12.jar;"%REPO%"\jsp-api-2.1-6.1.14.jar;"%REPO%"\jsp-2.1-6.1.14.jar;"%REPO%"\ant-1.6.5.jar;"%REPO%"\commons-el-1.0.jar;"%REPO%"\jets3t-0.7.1.jar;"%REPO%"\servlet-api-2.5-6.1.14.jar;"%REPO%"\kfs-0.3.jar;"%REPO%"\hsqldb-1.8.0.10.jar;"%REPO%"\oro-2.0.8.jar;"%REPO%"\core-3.1.1.jar;"%REPO%"\hadoop-test-0.20.2.jar;"%REPO%"\ftplet-api-1.0.0.jar;"%REPO%"\mina-core-2.0.0-M5.jar;"%REPO%"\ftpserver-core-1.0.0.jar;"%REPO%"\ftpserver-deprecated-1.0.0-M2.jar;"%REPO%"\hyracks-hdfs-core-0.2.3-SNAPSHOT.jar;"%REPO%"\hyracks-hdfs-0.20.2-0.2.3-SNAPSHOT.jar;"%REPO%"\genomix-data-0.2.3-SNAPSHOT.jar;"%REPO%"\genomix-hyracks-0.2.3-SNAPSHOT.jar
+goto endInit
+
+@REM Reaching here means variables are defined and arguments have been captured
+:endInit
+
+%JAVACMD% %JAVA_OPTS% -classpath %CLASSPATH_PREFIX%;%CLASSPATH% -Dapp.name="genomix" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" edu.uci.ics.genomix.driver.Driver %CMD_LINE_ARGS%
+if ERRORLEVEL 1 goto error
+goto end
+
+:error
+if "%OS%"=="Windows_NT" @endlocal
+set ERROR_CODE=%ERRORLEVEL%
+
+:end
+@REM set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" goto endNT
+
+@REM For old DOS remove the set variables from ENV - we assume they were not set
+@REM before we started - at least we don't leave any baggage around
+set CMD_LINE_ARGS=
+goto postExec
+
+:endNT
+@REM If error code is set to 1 then the endlocal was done already in :error.
+if %ERROR_CODE% EQU 0 @endlocal
+
+
+:postExec
+
+if "%FORCE_EXIT_ON_ERROR%" == "on" (
+ if %ERROR_CODE% NEQ 0 exit %ERROR_CODE%
+)
+
+exit /B %ERROR_CODE%
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/getip.sh b/genomix/genomix-hyracks/src/main/resources/scripts/getip.sh
new file mode 100644
index 0000000..e0cdf73
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/getip.sh
@@ -0,0 +1,21 @@
+#get the OS
+OS_NAME=`uname -a|awk '{print $1}'`
+LINUX_OS='Linux'
+
+if [ "$OS_NAME" = "$LINUX_OS" ];
+then
+ #Get IP Address
+ IPADDR=`/sbin/ifconfig eth0 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ if [ "$IPADDR" = "" ]
+ then
+ IPADDR=`/sbin/ifconfig lo | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ fi
+else
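+        #Non-Linux (assumed Mac OS X): read the address from en1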
+ IPADDR=`/sbin/ifconfig en1 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ if [ "$IPADDR" = "" ]
+ then
+ IPADDR=`/sbin/ifconfig lo0 | grep "inet " | awk '{print $2}' | cut -f 2 -d ':'`
+ fi
+
+fi
+echo $IPADDR
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/startAllNCs.sh b/genomix/genomix-hyracks/src/main/resources/scripts/startAllNCs.sh
new file mode 100644
index 0000000..28fcb84
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/startAllNCs.sh
@@ -0,0 +1,6 @@
+GENOMIX_PATH=`pwd`
+
+for i in `cat conf/slaves`
+do
+ ssh $i "cd ${GENOMIX_PATH}; export JAVA_HOME=${JAVA_HOME}; bin/startnc.sh"
+done
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/startCluster.sh b/genomix/genomix-hyracks/src/main/resources/scripts/startCluster.sh
new file mode 100755
index 0000000..843a6b1
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/startCluster.sh
@@ -0,0 +1,4 @@
+bin/startcc.sh
+sleep 5
+bin/startAllNCs.sh
+
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/startDebugNc.sh b/genomix/genomix-hyracks/src/main/resources/scripts/startDebugNc.sh
new file mode 100644
index 0000000..c335475
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/startDebugNc.sh
@@ -0,0 +1,50 @@
+hostname
+
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CURRENT_PATH=`pwd`
+CCHOST=`ssh ${CCHOST_NAME} "cd ${CURRENT_PATH}; bin/getip.sh"`
+
+#Import cluster properties
+. conf/cluster.properties
+. conf/debugnc.properties
+
+#Clean up temp dir
+
+#rm -rf $NCTMP_DIR2
+mkdir -p $NCTMP_DIR2
+
+#Clean up log dir
+#rm -rf $NCLOGS_DIR2
+mkdir -p $NCLOGS_DIR2
+
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS2 | tr "," "\n")
+for io_dir in $io_dirs
+do
+ #rm -rf $io_dir
+    mkdir -p $io_dir
+done
+
+#Set JAVA_HOME
+export JAVA_HOME=$JAVA_HOME
+
+#Get the IP address of this machine
+IPADDR=`bin/getip.sh`
+
+#Get node ID
+NODEID=`hostname | cut -d '.' -f 1`
+NODEID=${NODEID}2
+
+#Set JAVA_OPTS
+export JAVA_OPTS=$NCJAVA_OPTS2
+
+cd $HYRACKS_HOME
+HYRACKS_HOME=`pwd`
+
+#Enter the temp dir
+cd $NCTMP_DIR2
+
+#Launch hyracks nc
+$HYRACKS_HOME/hyracks-server/target/appassembler/bin/hyracksnc -cc-host $CCHOST -cc-port $CC_CLUSTERPORT -cluster-net-ip-address $IPADDR -data-ip-address $IPADDR -node-id $NODEID -iodevices "${IO_DIRS2}" &> $NCLOGS_DIR2/$NODEID.log &
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/startcc.sh b/genomix/genomix-hyracks/src/main/resources/scripts/startcc.sh
new file mode 100644
index 0000000..67023c1
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/startcc.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+hostname
+
+#Import cluster properties
+. conf/cluster.properties
+
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CCHOST=`bin/getip.sh`
+
+#Remove the temp dir
+#rm -rf $CCTMP_DIR
+mkdir -p $CCTMP_DIR
+
+#Remove the logs dir
+#rm -rf $CCLOGS_DIR
+mkdir -p $CCLOGS_DIR
+
+#Export JAVA_HOME and JAVA_OPTS
+export JAVA_HOME=$JAVA_HOME
+export JAVA_OPTS=$CCJAVA_OPTS
+
+GENOMIX_HOME=`pwd`
+cd $CCTMP_DIR
+#Launch hyracks cc script
+${GENOMIX_HOME}/bin/genomixcc -client-net-ip-address $CCHOST -cluster-net-ip-address $CCHOST -client-net-port $CC_CLIENTPORT -cluster-net-port $CC_CLUSTERPORT -max-heartbeat-lapse-periods 999999 -default-max-job-attempts 0 -job-history-size 3 &> $CCLOGS_DIR/cc.log &
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/startnc.sh b/genomix/genomix-hyracks/src/main/resources/scripts/startnc.sh
new file mode 100644
index 0000000..bfcc1d4
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/startnc.sh
@@ -0,0 +1,48 @@
+hostname
+
+MY_NAME=`hostname`
+#Get the IP address of the cc
+CCHOST_NAME=`cat conf/master`
+CURRENT_PATH=`pwd`
+CCHOST=`ssh ${CCHOST_NAME} "cd ${CURRENT_PATH}; bin/getip.sh"`
+
+#Import cluster properties
+. conf/cluster.properties
+
+#Clean up temp dir
+
+#rm -rf $NCTMP_DIR
+mkdir -p $NCTMP_DIR
+
+#Clean up log dir
+#rm -rf $NCLOGS_DIR
+mkdir -p $NCLOGS_DIR
+
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS | tr "," "\n")
+for io_dir in $io_dirs
+do
+ #rm -rf $io_dir
+ mkdir -p $io_dir
+done
+
+#Set JAVA_HOME
+export JAVA_HOME=$JAVA_HOME
+
+IPADDR=`bin/getip.sh`
+#echo $IPADDR
+
+#Get node ID
+NODEID=`hostname | cut -d '.' -f 1`
+
+#Set JAVA_OPTS
+export JAVA_OPTS=$NCJAVA_OPTS
+
+GENOMIX_HOME=`pwd`
+
+#Enter the temp dir
+cd $NCTMP_DIR
+
+#Launch hyracks nc
+${GENOMIX_HOME}/bin/genomixnc -cc-host $CCHOST -cc-port $CC_CLUSTERPORT -cluster-net-ip-address $IPADDR -data-ip-address $IPADDR -result-ip-address $IPADDR -node-id $NODEID -iodevices "${IO_DIRS}" &> $NCLOGS_DIR/$NODEID.log &
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/stopAllNCs.sh b/genomix/genomix-hyracks/src/main/resources/scripts/stopAllNCs.sh
new file mode 100644
index 0000000..66ed866
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/stopAllNCs.sh
@@ -0,0 +1,6 @@
+GENOMIX_PATH=`pwd`
+
+for i in `cat conf/slaves`
+do
+ ssh $i "cd ${GENOMIX_PATH}; bin/stopnc.sh"
+done
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/stopCluster.sh b/genomix/genomix-hyracks/src/main/resources/scripts/stopCluster.sh
new file mode 100644
index 0000000..4889934
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/stopCluster.sh
@@ -0,0 +1,3 @@
+bin/stopAllNCs.sh
+sleep 2
+bin/stopcc.sh
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/stopcc.sh b/genomix/genomix-hyracks/src/main/resources/scripts/stopcc.sh
new file mode 100644
index 0000000..6d1b2d2
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/stopcc.sh
@@ -0,0 +1,20 @@
+hostname
+. conf/cluster.properties
+
+#Kill process
+PID=`ps -ef|grep ${USER}|grep java|grep 'Dapp.name=genomixcc'|awk '{print $2}'`
+
+if [ "$PID" == "" ]; then
+ PID=`ps -ef|grep ${USER}|grep java|grep 'hyracks'|awk '{print $2}'`
+fi
+
+if [ "$PID" == "" ]; then
+ USERID=`id | sed 's/^uid=//;s/(.*$//'`
+ PID=`ps -ef|grep ${USERID}|grep java|grep 'Dapp.name=genomixcc'|awk '{print $2}'`
+fi
+
+echo $PID
+kill -9 $PID
+
+#Clean up CC temp dir
+rm -rf $CCTMP_DIR/*
diff --git a/genomix/genomix-hyracks/src/main/resources/scripts/stopnc.sh b/genomix/genomix-hyracks/src/main/resources/scripts/stopnc.sh
new file mode 100644
index 0000000..092e232
--- /dev/null
+++ b/genomix/genomix-hyracks/src/main/resources/scripts/stopnc.sh
@@ -0,0 +1,31 @@
+hostname
+. conf/cluster.properties
+
+#Kill process
+PID=`ps -ef|grep ${USER}|grep java|grep 'Dapp.name=genomixnc'|awk '{print $2}'`
+
+if [ "$PID" == "" ]; then
+ PID=`ps -ef|grep ${USER}|grep java|grep 'hyracks'|awk '{print $2}'`
+fi
+
+if [ "$PID" == "" ]; then
+ USERID=`id | sed 's/^uid=//;s/(.*$//'`
+ PID=`ps -ef|grep ${USERID}|grep java|grep 'Dapp.name=genomixnc'|awk '{print $2}'`
+fi
+
+echo $PID
+kill -9 $PID
+
+#Clean up I/O working dir
+io_dirs=$(echo $IO_DIRS | tr "," "\n")
+for io_dir in $io_dirs
+do
+ rm -rf $io_dir/*
+done
+
+#Clean up NC temp dir
+rm -rf $NCTMP_DIR/*
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
new file mode 100644
index 0000000..847272a
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/JobRunTest.java
@@ -0,0 +1,246 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.example.jobrun;
+
+import java.io.BufferedWriter;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import junit.framework.Assert;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.driver.Driver;
+import edu.uci.ics.genomix.driver.Driver.Plan;
+import edu.uci.ics.genomix.job.GenomixJob;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+public class JobRunTest {
+ private static final String ACTUAL_RESULT_DIR = "actual";
+ private static final String PATH_TO_HADOOP_CONF = "src/test/resources/hadoop/conf";
+
+ private static final String DATA_PATH = "src/test/resources/data/webmap/text.txt";
+ private static final String HDFS_INPUT_PATH = "/webmap";
+ private static final String HDFS_OUTPUT_PATH = "/webmap_result";
+
+ private static final String DUMPED_RESULT = ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH + "/merged.txt";
+ private static final String CONVERT_RESULT = DUMPED_RESULT + ".txt";
+ private static final String EXPECTED_PATH = "src/test/resources/expected/result2";
+ private static final String EXPECTED_REVERSE_PATH = "src/test/resources/expected/result_reverse";
+
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR + File.separator + "conf.xml";
+ private MiniDFSCluster dfsCluster;
+
+ private JobConf conf = new JobConf();
+ private int numberOfNC = 2;
+ private int numPartitionPerMachine = 1;
+
+ private Driver driver;
+
+ @Before
+ public void setUp() throws Exception {
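+        // Bring up the in-process Hyracks test cluster and a MiniDFSCluster, then stage the input data.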
+ cleanupStores();
+ edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.init();
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ startHDFS();
+
+ FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
+ FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
+
+ conf.setInt(GenomixJob.KMER_LENGTH, 5);
+ driver = new Driver(edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.CC_HOST,
+ edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT, numPartitionPerMachine);
+ }
+
+ private void cleanupStores() throws IOException {
+ FileUtils.forceMkdir(new File("teststore"));
+ FileUtils.forceMkdir(new File("build"));
+ FileUtils.cleanDirectory(new File("teststore"));
+ FileUtils.cleanDirectory(new File("build"));
+ }
+
+ private void startHDFS() throws IOException {
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
+
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, numberOfNC, true, null);
+ FileSystem dfs = FileSystem.get(conf);
+ Path src = new Path(DATA_PATH);
+ Path dest = new Path(HDFS_INPUT_PATH);
+ dfs.mkdirs(dest);
+ //dfs.mkdirs(result);
+ dfs.copyFromLocalFile(src, dest);
+
+ DataOutputStream confOutput = new DataOutputStream(new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
+
+ private void cleanUpReEntry() throws IOException {
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ if (lfs.exists(new Path(DUMPED_RESULT))) {
+ lfs.delete(new Path(DUMPED_RESULT), true);
+ }
+ FileSystem dfs = FileSystem.get(conf);
+ if (dfs.exists(new Path(HDFS_OUTPUT_PATH))) {
+ dfs.delete(new Path(HDFS_OUTPUT_PATH), true);
+ }
+ }
+
+ @Test
+    public void testAll() throws Exception {
+        cleanUpReEntry();
+        testExternalGroupby();
+        cleanUpReEntry();
+        testPreClusterGroupby();
+        cleanUpReEntry();
+        testHybridGroupby();
+        cleanUpReEntry();
+        conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+        testExternalReversedGroupby();
+        cleanUpReEntry();
+        testPreClusterReversedGroupby();
+        cleanUpReEntry();
+        testHybridReversedGroupby();
+    }
+
+    public void testExternalGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "external");
+ System.err.println("Testing ExternalGroupBy");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+ }
+
+    public void testPreClusterGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+ System.err.println("Testing PreClusterGroupBy");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+ }
+
+    public void testHybridGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
+ System.err.println("Testing HybridGroupBy");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_PATH));
+ }
+
+    public void testExternalReversedGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "external");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing ExternalGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
+
+    public void testPreClusterReversedGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "precluster");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing PreclusterGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
+
+    public void testHybridReversedGroupby() throws Exception {
+ conf.set(GenomixJob.GROUPBY_TYPE, "hybrid");
+ conf.setBoolean(GenomixJob.REVERSED_KMER, true);
+ System.err.println("Testing HybridGroupBy + Reversed");
+ driver.runJob(new GenomixJob(conf), Plan.BUILD_DEBRUJIN_GRAPH, true);
+ Assert.assertEquals(true, checkResults(EXPECTED_REVERSE_PATH));
+ }
+
+ private boolean checkResults(String expectedPath) throws Exception {
+ File dumped = null;
+ String format = conf.get(GenomixJob.OUTPUT_FORMAT);
+ if ("text".equalsIgnoreCase(format)) {
+ FileUtil.copyMerge(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH),
+ FileSystem.getLocal(new Configuration()), new Path(DUMPED_RESULT), false, conf, null);
+ dumped = new File(DUMPED_RESULT);
+ } else {
+
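+            // Binary output: read each SequenceFile partition and dump its key/value pairs as text.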
+ FileSystem.getLocal(new Configuration()).mkdirs(new Path(ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH));
+ File filePathTo = new File(CONVERT_RESULT);
+ BufferedWriter bw = new BufferedWriter(new FileWriter(filePathTo));
+ for (int i = 0; i < numPartitionPerMachine * numberOfNC; i++) {
+ String partname = "/part-" + i;
+ // FileUtil.copy(FileSystem.get(conf), new Path(HDFS_OUTPUT_PATH
+ // + partname), FileSystem.getLocal(new Configuration()),
+ // new Path(ACTUAL_RESULT_DIR + HDFS_OUTPUT_PATH + partname), false, conf);
+
+ Path path = new Path(HDFS_OUTPUT_PATH + partname);
+ FileSystem dfs = FileSystem.get(conf);
+ if (dfs.getFileStatus(path).getLen() == 0) {
+ continue;
+ }
+ SequenceFile.Reader reader = new SequenceFile.Reader(dfs, path, conf);
+
+ // KmerBytesWritable key = (KmerBytesWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+ KmerBytesWritable key = new KmerBytesWritable(conf.getInt(GenomixJob.KMER_LENGTH,
+ GenomixJob.DEFAULT_KMER));
+ KmerCountValue value = (KmerCountValue) ReflectionUtils.newInstance(reader.getValueClass(), conf);
+
+ while (reader.next(key, value)) {
+ if (key == null || value == null) {
+ break;
+ }
+ bw.write(key.toString() + "\t" + value.toString());
+ System.out.println(key.toString() + "\t" + value.toString());
+ bw.newLine();
+ }
+ reader.close();
+ }
+ bw.close();
+ dumped = new File(CONVERT_RESULT);
+ }
+
+ TestUtils.compareWithSortedResult(new File(expectedPath), dumped);
+ return true;
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ edu.uci.ics.hyracks.hdfs.utils.HyracksUtils.deinit();
+ cleanupHDFS();
+ }
+
+ private void cleanupHDFS() throws Exception {
+ dfsCluster.shutdown();
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/TestUtils.java b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/TestUtils.java
new file mode 100644
index 0000000..aa1f791
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/java/edu/uci/ics/genomix/example/jobrun/TestUtils.java
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.genomix.example.jobrun;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.Collections;
+
+public class TestUtils {
+ /**
+     * Compare the actual file against a sorted expected file.
+     * The actual file itself need not be sorted.
+ *
+ * @param expectedFile
+ * @param actualFile
+ */
+ public static void compareWithSortedResult(File expectedFile, File actualFile) throws Exception {
+ BufferedReader readerActual = new BufferedReader(new FileReader(actualFile));
+ BufferedReader readerExpected = new BufferedReader(new FileReader(expectedFile));
+ ArrayList<String> actualLines = new ArrayList<String>();
+ String lineExpected, lineActual;
+ try {
+ while ((lineActual = readerActual.readLine()) != null) {
+ actualLines.add(lineActual);
+ }
+ Collections.sort(actualLines);
+ int num = 1;
+ for (String actualLine : actualLines) {
+ lineExpected = readerExpected.readLine();
+ if (lineExpected == null) {
+ throw new Exception("Actual result changed at line " + num + ":\n< " + actualLine + "\n> ");
+ }
+ if (!equalStrings(lineExpected, actualLine)) {
+ throw new Exception("Result for changed at line " + num + ":\n< " + lineExpected + "\n> "
+ + actualLine);
+ }
+ ++num;
+ }
+ lineExpected = readerExpected.readLine();
+ if (lineExpected != null) {
+ throw new Exception("Actual result changed at line " + num + ":\n< \n> " + lineExpected);
+ }
+ } finally {
+ readerActual.close();
+ readerExpected.close();
+ }
+ }
+
+ public static void compareWithResult(File expectedFile, File actualFile) throws Exception {
+ BufferedReader readerExpected = new BufferedReader(new FileReader(expectedFile));
+ BufferedReader readerActual = new BufferedReader(new FileReader(actualFile));
+ String lineExpected, lineActual;
+ int num = 1;
+ try {
+ while ((lineExpected = readerExpected.readLine()) != null) {
+ lineActual = readerActual.readLine();
+ // Assert.assertEquals(lineExpected, lineActual);
+ if (lineActual == null) {
+ throw new Exception("Actual result changed at line " + num + ":\n< " + lineExpected + "\n> ");
+ }
+ if (!equalStrings(lineExpected, lineActual)) {
+ throw new Exception("Result for changed at line " + num + ":\n< " + lineExpected + "\n> "
+ + lineActual);
+ }
+ ++num;
+ }
+ lineActual = readerActual.readLine();
+ if (lineActual != null) {
+ throw new Exception("Actual result changed at line " + num + ":\n< \n> " + lineActual);
+ }
+ } finally {
+ readerExpected.close();
+ readerActual.close();
+ }
+ }
+
+ private static boolean equalStrings(String s1, String s2) {
+ String[] rowsOne = s1.split("\n");
+ String[] rowsTwo = s2.split("\n");
+
+ if (rowsOne.length != rowsTwo.length)
+ return false;
+
+ for (int i = 0; i < rowsOne.length; i++) {
+ String row1 = rowsOne[i];
+ String row2 = rowsTwo[i];
+
+ if (row1.equals(row2))
+ continue;
+
+ String[] fields1 = row1.split(",");
+ String[] fields2 = row2.split(",");
+
+ for (int j = 0; j < fields1.length; j++) {
+ if (fields1[j].equals(fields2[j])) {
+ continue;
+ } else if (fields1[j].indexOf('.') < 0) {
+ return false;
+ } else {
+ fields1[j] = fields1[j].split("=")[1];
+ fields2[j] = fields2[j].split("=")[1];
+ Double double1 = Double.parseDouble(fields1[j]);
+ Double double2 = Double.parseDouble(fields2[j]);
+ float float1 = (float) double1.doubleValue();
+ float float2 = (float) double2.doubleValue();
+
+ if (Math.abs(float1 - float2) == 0)
+ continue;
+ else {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+}
diff --git a/genomix/genomix-hyracks/src/test/resources/data/0/text.txt b/genomix/genomix-hyracks/src/test/resources/data/0/text.txt
new file mode 100755
index 0000000..f63a141
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/data/0/text.txt
@@ -0,0 +1,4 @@
+@625E1AAXX100810:1:100:10000:10271/1
+AATAGAAG
++
+EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
diff --git a/genomix/genomix-hyracks/src/test/resources/data/webmap/text.txt b/genomix/genomix-hyracks/src/test/resources/data/webmap/text.txt
new file mode 100755
index 0000000..9874fc2
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/data/webmap/text.txt
@@ -0,0 +1,8 @@
+@625E1AAXX100810:1:100:10000:10271/1
+AATAGAAG
+AATAGAAG
++
+EDBDB?BEEEDGGEGGGDGGGA>DG@GGD;GD@DG@F?<B<BFFD?
+AATAGAAG
+AATAGAAG
+AATAGAAG
\ No newline at end of file
diff --git a/genomix/genomix-hyracks/src/test/resources/expected/result2 b/genomix/genomix-hyracks/src/test/resources/expected/result2
new file mode 100755
index 0000000..9296453
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/expected/result2
@@ -0,0 +1,4 @@
+AATAG |A 5
+AGAAG T| 5
+ATAGA A|A 5
+TAGAA A|G 5
diff --git a/genomix/genomix-hyracks/src/test/resources/expected/result_reverse b/genomix/genomix-hyracks/src/test/resources/expected/result_reverse
new file mode 100644
index 0000000..cf2712d
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/expected/result_reverse
@@ -0,0 +1,8 @@
+AAGAT G|A 5
+AATAG |A 5
+AGAAG T| 5
+AGATA A|A 5
+ATAGA A|A 5
+GAAGA |T 5
+GATAA A| 5
+TAGAA A|G 5
diff --git a/genomix/genomix-hyracks/src/test/resources/hadoop/conf/core-site.xml b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/core-site.xml
new file mode 100644
index 0000000..3e5bacb
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/core-site.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+ <property>
+ <name>fs.default.name</name>
+ <value>hdfs://127.0.0.1:31888</value>
+ </property>
+ <property>
+ <name>hadoop.tmp.dir</name>
+ <value>/tmp/hadoop</value>
+ </property>
+
+
+</configuration>
diff --git a/genomix/genomix-hyracks/src/test/resources/hadoop/conf/hdfs-site.xml b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/hdfs-site.xml
new file mode 100644
index 0000000..b1b1902
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/hdfs-site.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+ <property>
+ <name>dfs.replication</name>
+ <value>1</value>
+ </property>
+
+ <property>
+ <name>dfs.block.size</name>
+ <value>65536</value>
+ </property>
+
+</configuration>
diff --git a/genomix/genomix-hyracks/src/test/resources/hadoop/conf/log4j.properties b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/log4j.properties
new file mode 100755
index 0000000..d5e6004
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/log4j.properties
@@ -0,0 +1,94 @@
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=FATAL,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+# Define the root logger to the system property "hadoop.root.logger".
+log4j.rootLogger=${hadoop.root.logger}, EventCounter
+
+# Logging Threshold
+log4j.threshold=FATAL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+
+#Default values
+hadoop.tasklog.taskid=null
+hadoop.tasklog.noKeepSplits=4
+hadoop.tasklog.totalLogFileSize=100
+hadoop.tasklog.purgeLogSplits=true
+hadoop.tasklog.logsRetainHours=12
+
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
+log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# Rolling File Appender
+#
+
+#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
+#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Logfile size and 30-day backups
+#log4j.appender.RFA.MaxFileSize=1MB
+#log4j.appender.RFA.MaxBackupIndex=30
+
+#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+#
+# FSNamesystem Audit logging
+# All audit events are logged at INFO level
+#
+log4j.logger.org.apache.hadoop.fs.FSNamesystem.audit=WARN
+
+# Custom Logging levels
+
+#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
+#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
+#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG
+
+# Jets3t library
+log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
diff --git a/genomix/genomix-hyracks/src/test/resources/hadoop/conf/mapred-site.xml b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/mapred-site.xml
new file mode 100644
index 0000000..525e7d5
--- /dev/null
+++ b/genomix/genomix-hyracks/src/test/resources/hadoop/conf/mapred-site.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+ <property>
+ <name>mapred.job.tracker</name>
+ <value>localhost:29007</value>
+ </property>
+ <property>
+ <name>mapred.tasktracker.map.tasks.maximum</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>mapred.tasktracker.reduce.tasks.maximum</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>mapred.max.split.size</name>
+ <value>2048</value>
+ </property>
+
+</configuration>
diff --git a/genomix/genomix-pregelix/data/result/.output.crc b/genomix/genomix-pregelix/data/result/.output.crc
new file mode 100644
index 0000000..c4995af
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/.output.crc
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/BridgePath b/genomix/genomix-pregelix/data/result/BridgePath
new file mode 100755
index 0000000..90f0a8a
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/BridgePath
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/CyclePath b/genomix/genomix-pregelix/data/result/CyclePath
new file mode 100755
index 0000000..0d50d01
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/CyclePath
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/LongPath b/genomix/genomix-pregelix/data/result/LongPath
new file mode 100755
index 0000000..b1040ab
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/LongPath
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/Path b/genomix/genomix-pregelix/data/result/Path
new file mode 100755
index 0000000..76b1a0e
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/Path
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/SimplePath b/genomix/genomix-pregelix/data/result/SimplePath
new file mode 100755
index 0000000..dfabc43
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/SimplePath
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/SinglePath b/genomix/genomix-pregelix/data/result/SinglePath
new file mode 100755
index 0000000..6329aa6
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/SinglePath
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/ThreeKmer b/genomix/genomix-pregelix/data/result/ThreeKmer
new file mode 100755
index 0000000..f0435c7
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/ThreeKmer
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/TreePath b/genomix/genomix-pregelix/data/result/TreePath
new file mode 100755
index 0000000..dc8d16c
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/TreePath
Binary files differ
diff --git a/genomix/genomix-pregelix/data/result/TwoKmer b/genomix/genomix-pregelix/data/result/TwoKmer
new file mode 100755
index 0000000..73024db
--- /dev/null
+++ b/genomix/genomix-pregelix/data/result/TwoKmer
Binary files differ
diff --git a/genomix/genomix-pregelix/graph/BridgePath b/genomix/genomix-pregelix/graph/BridgePath
new file mode 100644
index 0000000..c10ea60
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/BridgePath
@@ -0,0 +1,14 @@
+ACCCC C|G 1
+CCCCG A|T 1
+CTCCG A|T 1
+TTCCA T|C 2
+ACTCC C|G 1
+CCGTG CT| 2
+TCCAC T|CT 2
+CCACC T|C 1
+CCACT T|C 1
+CACCC C|C 1
+TTTCC |A 2
+CCCGT C|G 1
+TCCGT C|G 1
+CACTC C|C 1
diff --git a/genomix/genomix-pregelix/graph/BridgePath_out.ps b/genomix/genomix-pregelix/graph/BridgePath_out.ps
new file mode 100644
index 0000000..9f66e44
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/BridgePath_out.ps
@@ -0,0 +1,603 @@
+%!PS-Adobe-3.0
+%%Creator: graphviz version 2.26.3 (20100126.1600)
+%%Title: G
+%%Pages: (atend)
+%%BoundingBox: (atend)
+%%EndComments
+save
+%%BeginProlog
+/DotDict 200 dict def
+DotDict begin
+
+/setupLatin1 {
+mark
+/EncodingVector 256 array def
+ EncodingVector 0
+
+ISOLatin1Encoding 0 255 getinterval putinterval
+EncodingVector 45 /hyphen put
+
+% Set up ISO Latin 1 character encoding
+/starnetISO {
+ dup dup findfont dup length dict begin
+ { 1 index /FID ne { def }{ pop pop } ifelse
+ } forall
+ /Encoding EncodingVector def
+ currentdict end definefont
+} def
+/Times-Roman starnetISO def
+/Times-Italic starnetISO def
+/Times-Bold starnetISO def
+/Times-BoldItalic starnetISO def
+/Helvetica starnetISO def
+/Helvetica-Oblique starnetISO def
+/Helvetica-Bold starnetISO def
+/Helvetica-BoldOblique starnetISO def
+/Courier starnetISO def
+/Courier-Oblique starnetISO def
+/Courier-Bold starnetISO def
+/Courier-BoldOblique starnetISO def
+cleartomark
+} bind def
+
+%%BeginResource: procset graphviz 0 0
+/coord-font-family /Times-Roman def
+/default-font-family /Times-Roman def
+/coordfont coord-font-family findfont 8 scalefont def
+
+/InvScaleFactor 1.0 def
+/set_scale {
+ dup 1 exch div /InvScaleFactor exch def
+ scale
+} bind def
+
+% styles
+/solid { [] 0 setdash } bind def
+/dashed { [9 InvScaleFactor mul dup ] 0 setdash } bind def
+/dotted { [1 InvScaleFactor mul 6 InvScaleFactor mul] 0 setdash } bind def
+/invis {/fill {newpath} def /stroke {newpath} def /show {pop newpath} def} bind def
+/bold { 2 setlinewidth } bind def
+/filled { } bind def
+/unfilled { } bind def
+/rounded { } bind def
+/diagonals { } bind def
+
+% hooks for setting color
+/nodecolor { sethsbcolor } bind def
+/edgecolor { sethsbcolor } bind def
+/graphcolor { sethsbcolor } bind def
+/nopcolor {pop pop pop} bind def
+
+/beginpage { % i j npages
+ /npages exch def
+ /j exch def
+ /i exch def
+ /str 10 string def
+ npages 1 gt {
+ gsave
+ coordfont setfont
+ 0 0 moveto
+ (\() show i str cvs show (,) show j str cvs show (\)) show
+ grestore
+ } if
+} bind def
+
+/set_font {
+ findfont exch
+ scalefont setfont
+} def
+
+% draw text fitted to its expected width
+/alignedtext { % width text
+ /text exch def
+ /width exch def
+ gsave
+ width 0 gt {
+ [] 0 setdash
+ text stringwidth pop width exch sub text length div 0 text ashow
+ } if
+ grestore
+} def
+
+/boxprim { % xcorner ycorner xsize ysize
+ 4 2 roll
+ moveto
+ 2 copy
+ exch 0 rlineto
+ 0 exch rlineto
+ pop neg 0 rlineto
+ closepath
+} bind def
+
+/ellipse_path {
+ /ry exch def
+ /rx exch def
+ /y exch def
+ /x exch def
+ matrix currentmatrix
+ newpath
+ x y translate
+ rx ry scale
+ 0 0 1 0 360 arc
+ setmatrix
+} bind def
+
+/endpage { showpage } bind def
+/showpage { } def
+
+/layercolorseq
+ [ % layer color sequence - darkest to lightest
+ [0 0 0]
+ [.2 .8 .8]
+ [.4 .8 .8]
+ [.6 .8 .8]
+ [.8 .8 .8]
+ ]
+def
+
+/layerlen layercolorseq length def
+
+/setlayer {/maxlayer exch def /curlayer exch def
+ layercolorseq curlayer 1 sub layerlen mod get
+ aload pop sethsbcolor
+ /nodecolor {nopcolor} def
+ /edgecolor {nopcolor} def
+ /graphcolor {nopcolor} def
+} bind def
+
+/onlayer { curlayer ne {invis} if } def
+
+/onlayers {
+ /myupper exch def
+ /mylower exch def
+ curlayer mylower lt
+ curlayer myupper gt
+ or
+ {invis} if
+} def
+
+/curlayer 0 def
+
+%%EndResource
+%%EndProlog
+%%BeginSetup
+14 default-font-family set_font
+1 setmiterlimit
+% /arrowlength 10 def
+% /arrowwidth 5 def
+
+% make sure pdfmark is harmless for PS-interpreters other than Distiller
+/pdfmark where {pop} {userdict /pdfmark /cleartomark load put} ifelse
+% make '<<' and '>>' safe on PS Level 1 devices
+/languagelevel where {pop languagelevel}{1} ifelse
+2 lt {
+ userdict (<<) cvn ([) cvn load put
+ userdict (>>) cvn ([) cvn load put
+} if
+
+%%EndSetup
+setupLatin1
+%%Page: 1 1
+%%PageBoundingBox: 36 36 248 674
+%%PageOrientation: Portrait
+0 0 1 beginpage
+gsave
+36 36 212 638 boxprim clip newpath
+1 1 set_scale 0 rotate 40 41 translate
+% ACCCC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 241 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 237.4 moveto 49 (ACCCC) alignedtext
+grestore
+% CCCCG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 167 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+21.5 163.4 moveto 51 (CCCCG) alignedtext
+grestore
+% ACCCC->CCCCG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47 222.33 moveto
+47 214.26 47 204.65 47 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.5 195.67 moveto
+47 185.67 lineto
+43.5 195.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.5 195.67 moveto
+47 185.67 lineto
+43.5 195.67 lineto
+closepath stroke
+grestore
+% CCCGT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 93 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23.5 89.4 moveto 49 (CCCGT) alignedtext
+grestore
+% CCCCG->CCCGT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47.25 148.33 moveto
+47.36 140.26 47.49 130.65 47.61 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.11 121.71 moveto
+47.75 111.67 lineto
+44.11 121.62 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.11 121.71 moveto
+47.75 111.67 lineto
+44.11 121.62 lineto
+closepath stroke
+grestore
+% CCGTG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+102 19 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+76.5 15.4 moveto 51 (CCGTG) alignedtext
+grestore
+% CCCGT->CCGTG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 61.07 75.09 moveto
+67.59 66.15 75.6 55.18 82.8 45.31 curveto
+stroke
+0 0 0 edgecolor
+newpath 85.79 47.15 moveto
+88.86 37 lineto
+80.14 43.02 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 85.79 47.15 moveto
+88.86 37 lineto
+80.14 43.02 lineto
+closepath stroke
+grestore
+% CTCCG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+158 167 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+133.5 163.4 moveto 49 (CTCCG) alignedtext
+grestore
+% TCCGT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+157 93 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+133 89.4 moveto 48 (TCCGT) alignedtext
+grestore
+% CTCCG->TCCGT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 157.75 148.33 moveto
+157.64 140.26 157.51 130.65 157.39 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 160.89 121.62 moveto
+157.25 111.67 lineto
+153.89 121.71 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 160.89 121.62 moveto
+157.25 111.67 lineto
+153.89 121.71 lineto
+closepath stroke
+grestore
+% TCCGT->CCGTG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 143.69 75.09 moveto
+137.05 66.15 128.89 55.18 121.56 45.31 curveto
+stroke
+0 0 0 edgecolor
+newpath 124.16 42.94 moveto
+115.38 37 lineto
+118.54 47.12 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 124.16 42.94 moveto
+115.38 37 lineto
+118.54 47.12 lineto
+closepath stroke
+grestore
+% TTCCA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+102 537 44.05 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+78.5 533.4 moveto 47 (TTCCA) alignedtext
+grestore
+% TCCAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+102 463 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+78 459.4 moveto 48 (TCCAC) alignedtext
+grestore
+% TTCCA->TCCAC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 102 518.33 moveto
+102 510.26 102 500.65 102 491.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 105.5 491.67 moveto
+102 481.67 lineto
+98.5 491.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 105.5 491.67 moveto
+102 481.67 lineto
+98.5 491.67 lineto
+closepath stroke
+grestore
+% CCACC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 389 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 385.4 moveto 49 (CCACC) alignedtext
+grestore
+% TCCAC->CCACC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 88.69 445.09 moveto
+81.93 436 73.6 424.79 66.17 414.79 curveto
+stroke
+0 0 0 edgecolor
+newpath 68.97 412.69 moveto
+60.19 406.75 lineto
+63.35 416.86 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 68.97 412.69 moveto
+60.19 406.75 lineto
+63.35 416.86 lineto
+closepath stroke
+grestore
+% CCACT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+157 389 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+132.5 385.4 moveto 49 (CCACT) alignedtext
+grestore
+% TCCAC->CCACT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 115.31 445.09 moveto
+122.07 436 130.4 424.79 137.83 414.79 curveto
+stroke
+0 0 0 edgecolor
+newpath 140.65 416.86 moveto
+143.81 406.75 lineto
+135.03 412.69 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 140.65 416.86 moveto
+143.81 406.75 lineto
+135.03 412.69 lineto
+closepath stroke
+grestore
+% ACTCC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+157 241 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+133 237.4 moveto 48 (ACTCC) alignedtext
+grestore
+% ACTCC->CTCCG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 157.25 222.33 moveto
+157.36 214.26 157.49 204.65 157.61 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 161.11 195.71 moveto
+157.75 185.67 lineto
+154.11 195.62 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 161.11 195.71 moveto
+157.75 185.67 lineto
+154.11 195.62 lineto
+closepath stroke
+grestore
+% CACCC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 315 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 311.4 moveto 49 (CACCC) alignedtext
+grestore
+% CCACC->CACCC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47 370.33 moveto
+47 362.26 47 352.65 47 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.5 343.67 moveto
+47 333.67 lineto
+43.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.5 343.67 moveto
+47 333.67 lineto
+43.5 343.67 lineto
+closepath stroke
+grestore
+% CACTC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+157 315 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+133 311.4 moveto 48 (CACTC) alignedtext
+grestore
+% CCACT->CACTC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 157 370.33 moveto
+157 362.26 157 352.65 157 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 160.5 343.67 moveto
+157 333.67 lineto
+153.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 160.5 343.67 moveto
+157 333.67 lineto
+153.5 343.67 lineto
+closepath stroke
+grestore
+% CACCC->ACCCC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47 296.33 moveto
+47 288.26 47 278.65 47 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.5 269.67 moveto
+47 259.67 lineto
+43.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.5 269.67 moveto
+47 259.67 lineto
+43.5 269.67 lineto
+closepath stroke
+grestore
+% CACTC->ACTCC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 157 296.33 moveto
+157 288.26 157 278.65 157 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 160.5 269.67 moveto
+157 259.67 lineto
+153.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 160.5 269.67 moveto
+157 259.67 lineto
+153.5 269.67 lineto
+closepath stroke
+grestore
+% TTTCC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+102 611 43.84 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+79 607.4 moveto 46 (TTTCC) alignedtext
+grestore
+% TTTCC->TTCCA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 102 592.33 moveto
+102 584.26 102 574.65 102 565.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 105.5 565.67 moveto
+102 555.67 lineto
+98.5 565.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 105.5 565.67 moveto
+102 555.67 lineto
+98.5 565.67 lineto
+closepath stroke
+grestore
+endpage
+showpage
+grestore
+%%PageTrailer
+%%EndPage: 1
+%%Trailer
+%%Pages: 1
+%%BoundingBox: 36 36 248 674
+end
+restore
+%%EOF
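
The *_out.ps files in this patch are stock graphviz 2.26.3 output (see the %%Creator comment): one ellipse per k-mer, one arrow per outgoing base. The generator itself is not included here; a hedged reconstruction is to emit a DOT digraph from a fixture (successor = the k-mer minus its first base, plus the outgoing base) and render it with dot -Tps. GraphToDot below is illustrative, not the authors' tool:

    import java.io.PrintWriter;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    // Illustrative reconstruction: fixture -> DOT digraph. Render afterwards
    // with: dot -Tps BridgePath.dot -o BridgePath_out.ps
    public class GraphToDot {
        public static void main(String[] args) throws Exception {
            try (PrintWriter dot = new PrintWriter(args[0] + ".dot")) {
                dot.println("digraph G {");
                for (String line : Files.readAllLines(Paths.get(args[0]))) {
                    String[] cols = line.trim().split("\\s+");
                    String outgoing = cols[1].split("\\|", -1)[1];
                    for (char b : outgoing.toCharArray()) {
                        // successor k-mer: drop the first base, append b
                        dot.printf("    %s -> %s%n", cols[0], cols[0].substring(1) + b);
                    }
                }
                dot.println("}");
            }
        }
    }

Run as java GraphToDot graph/BridgePath followed by dot -Tps; this should reproduce the topology drawn above, though not necessarily the exact coordinates.
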
diff --git a/genomix/genomix-pregelix/graph/CyclePath b/genomix/genomix-pregelix/graph/CyclePath
new file mode 100644
index 0000000..db30e3a
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/CyclePath
@@ -0,0 +1,10 @@
+GCAAC |T 1
+CATCA T|A 1
+CTTCA A|T 1
+AACTT C|C 1
+ACTTC A|A 1
+TCAAC A|T 1
+ATCAA C|C 1
+TTCAT C|C 1
+CAACT GT|T 2
+TCATC T|A 1
diff --git a/genomix/genomix-pregelix/graph/CyclePath_out.ps b/genomix/genomix-pregelix/graph/CyclePath_out.ps
new file mode 100644
index 0000000..1b13ecb
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/CyclePath_out.ps
@@ -0,0 +1,489 @@
+%!PS-Adobe-3.0
+%%Creator: graphviz version 2.26.3 (20100126.1600)
+%%Title: G
+%%Pages: (atend)
+%%BoundingBox: (atend)
+%%EndComments
+save
+%%BeginProlog
+/DotDict 200 dict def
+DotDict begin
+
+/setupLatin1 {
+mark
+/EncodingVector 256 array def
+ EncodingVector 0
+
+ISOLatin1Encoding 0 255 getinterval putinterval
+EncodingVector 45 /hyphen put
+
+% Set up ISO Latin 1 character encoding
+/starnetISO {
+ dup dup findfont dup length dict begin
+ { 1 index /FID ne { def }{ pop pop } ifelse
+ } forall
+ /Encoding EncodingVector def
+ currentdict end definefont
+} def
+/Times-Roman starnetISO def
+/Times-Italic starnetISO def
+/Times-Bold starnetISO def
+/Times-BoldItalic starnetISO def
+/Helvetica starnetISO def
+/Helvetica-Oblique starnetISO def
+/Helvetica-Bold starnetISO def
+/Helvetica-BoldOblique starnetISO def
+/Courier starnetISO def
+/Courier-Oblique starnetISO def
+/Courier-Bold starnetISO def
+/Courier-BoldOblique starnetISO def
+cleartomark
+} bind def
+
+%%BeginResource: procset graphviz 0 0
+/coord-font-family /Times-Roman def
+/default-font-family /Times-Roman def
+/coordfont coord-font-family findfont 8 scalefont def
+
+/InvScaleFactor 1.0 def
+/set_scale {
+ dup 1 exch div /InvScaleFactor exch def
+ scale
+} bind def
+
+% styles
+/solid { [] 0 setdash } bind def
+/dashed { [9 InvScaleFactor mul dup ] 0 setdash } bind def
+/dotted { [1 InvScaleFactor mul 6 InvScaleFactor mul] 0 setdash } bind def
+/invis {/fill {newpath} def /stroke {newpath} def /show {pop newpath} def} bind def
+/bold { 2 setlinewidth } bind def
+/filled { } bind def
+/unfilled { } bind def
+/rounded { } bind def
+/diagonals { } bind def
+
+% hooks for setting color
+/nodecolor { sethsbcolor } bind def
+/edgecolor { sethsbcolor } bind def
+/graphcolor { sethsbcolor } bind def
+/nopcolor {pop pop pop} bind def
+
+/beginpage { % i j npages
+ /npages exch def
+ /j exch def
+ /i exch def
+ /str 10 string def
+ npages 1 gt {
+ gsave
+ coordfont setfont
+ 0 0 moveto
+ (\() show i str cvs show (,) show j str cvs show (\)) show
+ grestore
+ } if
+} bind def
+
+/set_font {
+ findfont exch
+ scalefont setfont
+} def
+
+% draw text fitted to its expected width
+/alignedtext { % width text
+ /text exch def
+ /width exch def
+ gsave
+ width 0 gt {
+ [] 0 setdash
+ text stringwidth pop width exch sub text length div 0 text ashow
+ } if
+ grestore
+} def
+
+/boxprim { % xcorner ycorner xsize ysize
+ 4 2 roll
+ moveto
+ 2 copy
+ exch 0 rlineto
+ 0 exch rlineto
+ pop neg 0 rlineto
+ closepath
+} bind def
+
+/ellipse_path {
+ /ry exch def
+ /rx exch def
+ /y exch def
+ /x exch def
+ matrix currentmatrix
+ newpath
+ x y translate
+ rx ry scale
+ 0 0 1 0 360 arc
+ setmatrix
+} bind def
+
+/endpage { showpage } bind def
+/showpage { } def
+
+/layercolorseq
+ [ % layer color sequence - darkest to lightest
+ [0 0 0]
+ [.2 .8 .8]
+ [.4 .8 .8]
+ [.6 .8 .8]
+ [.8 .8 .8]
+ ]
+def
+
+/layerlen layercolorseq length def
+
+/setlayer {/maxlayer exch def /curlayer exch def
+ layercolorseq curlayer 1 sub layerlen mod get
+ aload pop sethsbcolor
+ /nodecolor {nopcolor} def
+ /edgecolor {nopcolor} def
+ /graphcolor {nopcolor} def
+} bind def
+
+/onlayer { curlayer ne {invis} if } def
+
+/onlayers {
+ /myupper exch def
+ /mylower exch def
+ curlayer mylower lt
+ curlayer myupper gt
+ or
+ {invis} if
+} def
+
+/curlayer 0 def
+
+%%EndResource
+%%EndProlog
+%%BeginSetup
+14 default-font-family set_font
+1 setmiterlimit
+% /arrowlength 10 def
+% /arrowwidth 5 def
+
+% make sure pdfmark is harmless for PS-interpreters other than Distiller
+/pdfmark where {pop} {userdict /pdfmark /cleartomark load put} ifelse
+% make '<<' and '>>' safe on PS Level 1 devices
+/languagelevel where {pop languagelevel}{1} ifelse
+2 lt {
+ userdict (<<) cvn ([) cvn load put
+ userdict (>>) cvn ([) cvn load put
+} if
+
+%%EndSetup
+setupLatin1
+%%Page: 1 1
+%%PageBoundingBox: 36 36 175 748
+%%PageOrientation: Portrait
+0 0 1 beginpage
+gsave
+36 36 139 712 boxprim clip newpath
+1 1 set_scale 0 rotate 40 41 translate
+% GCAAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+83 685 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+57 681.4 moveto 52 (GCAAC) alignedtext
+grestore
+% CAACT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+83 611 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+58.5 607.4 moveto 49 (CAACT) alignedtext
+grestore
+% GCAAC->CAACT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 83 666.33 moveto
+83 658.26 83 648.65 83 639.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 86.5 639.67 moveto
+83 629.67 lineto
+79.5 639.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 86.5 639.67 moveto
+83 629.67 lineto
+79.5 639.67 lineto
+closepath stroke
+grestore
+% AACTT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+46 537 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+21.5 533.4 moveto 49 (AACTT) alignedtext
+grestore
+% CAACT->AACTT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 73.85 592.71 moveto
+69.58 584.17 64.41 573.83 59.69 564.38 curveto
+stroke
+0 0 0 edgecolor
+newpath 62.77 562.72 moveto
+55.17 555.34 lineto
+56.51 565.85 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 62.77 562.72 moveto
+55.17 555.34 lineto
+56.51 565.85 lineto
+closepath stroke
+grestore
+% CATCA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 167 44.05 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23.5 163.4 moveto 47 (CATCA) alignedtext
+grestore
+% ATCAA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 93 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23 89.4 moveto 48 (ATCAA) alignedtext
+grestore
+% CATCA->ATCAA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47 148.33 moveto
+47 140.26 47 130.65 47 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.5 121.67 moveto
+47 111.67 lineto
+43.5 121.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.5 121.67 moveto
+47 111.67 lineto
+43.5 121.67 lineto
+closepath stroke
+grestore
+% TCAAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+83 19 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+58.5 15.4 moveto 49 (TCAAC) alignedtext
+grestore
+% ATCAA->TCAAC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 55.9 74.71 moveto
+60.05 66.17 65.08 55.83 69.68 46.38 curveto
+stroke
+0 0 0 edgecolor
+newpath 72.85 47.86 moveto
+74.08 37.34 lineto
+66.56 44.8 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 72.85 47.86 moveto
+74.08 37.34 lineto
+66.56 44.8 lineto
+closepath stroke
+grestore
+% CTTCA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 389 44.05 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23.5 385.4 moveto 47 (CTTCA) alignedtext
+grestore
+% TTCAT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 315 43.84 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+24 311.4 moveto 46 (TTCAT) alignedtext
+grestore
+% CTTCA->TTCAT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47 370.33 moveto
+47 362.26 47 352.65 47 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.5 343.67 moveto
+47 333.67 lineto
+43.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.5 343.67 moveto
+47 333.67 lineto
+43.5 343.67 lineto
+closepath stroke
+grestore
+% TCATC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 241 43.84 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+24 237.4 moveto 46 (TCATC) alignedtext
+grestore
+% TTCAT->TCATC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47 296.33 moveto
+47 288.26 47 278.65 47 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.5 269.67 moveto
+47 259.67 lineto
+43.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.5 269.67 moveto
+47 259.67 lineto
+43.5 269.67 lineto
+closepath stroke
+grestore
+% ACTTC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+47 463 44.05 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23.5 459.4 moveto 47 (ACTTC) alignedtext
+grestore
+% AACTT->ACTTC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 46.25 518.33 moveto
+46.36 510.26 46.49 500.65 46.61 491.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.11 491.71 moveto
+46.75 481.67 lineto
+43.11 491.62 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.11 491.71 moveto
+46.75 481.67 lineto
+43.11 491.62 lineto
+closepath stroke
+grestore
+% ACTTC->CTTCA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47 444.33 moveto
+47 436.26 47 426.65 47 417.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.5 417.67 moveto
+47 407.67 lineto
+43.5 417.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.5 417.67 moveto
+47 407.67 lineto
+43.5 417.67 lineto
+closepath stroke
+grestore
+% TCAAC->CAACT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 90.67 37.32 moveto
+101.5 64.93 120 119.06 120 167 curveto
+120 463 120 463 120 463 curveto
+120 505.88 105.2 553.72 94.32 583.1 curveto
+stroke
+0 0 0 edgecolor
+newpath 90.96 582.09 moveto
+90.67 592.68 lineto
+97.5 584.58 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 90.96 582.09 moveto
+90.67 592.68 lineto
+97.5 584.58 lineto
+closepath stroke
+grestore
+% TCATC->CATCA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 47 222.33 moveto
+47 214.26 47 204.65 47 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 50.5 195.67 moveto
+47 185.67 lineto
+43.5 195.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 50.5 195.67 moveto
+47 185.67 lineto
+43.5 195.67 lineto
+closepath stroke
+grestore
+endpage
+showpage
+grestore
+%%PageTrailer
+%%EndPage: 1
+%%Trailer
+%%Pages: 1
+%%BoundingBox: 36 36 175 748
+end
+restore
+%%EOF
diff --git a/genomix/genomix-pregelix/graph/LongPath b/genomix/genomix-pregelix/graph/LongPath
new file mode 100644
index 0000000..82c0298
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/LongPath
@@ -0,0 +1,13 @@
+CTCAG C|T 1
+AGTAC C|G 1
+GGCCT |C 1
+ACGCC T|C 1
+CCTCA G|G 1
+CCCGG G| 1
+GCCTC G|A 1
+CAGTA T|C 1
+GTACG A|C 1
+GCCCG C|G 1
+CGCCC A|G 1
+TCAGT C|A 1
+TACGC G|C 1
diff --git a/genomix/genomix-pregelix/graph/Path b/genomix/genomix-pregelix/graph/Path
new file mode 100644
index 0000000..67d55ca
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/Path
@@ -0,0 +1,8 @@
+CTCAG C|T 1
+AGTAC C|G 1
+GGCCT |C 1
+CCTCA G|G 1
+GCCTC G|A 1
+CAGTA T|C 1
+GTACG A| 1
+TCAGT C|A 1
diff --git a/genomix/genomix-pregelix/graph/SimplePath b/genomix/genomix-pregelix/graph/SimplePath
new file mode 100644
index 0000000..2e0667e
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/SimplePath
@@ -0,0 +1,18 @@
+AGCAC C| 1
+AAGAC |A 1
+CAGCA A|C 1
+TCGCA A|T 1
+CGGCA G|A 1
+TATCG A|C 1
+CAAGA G|A 1
+ACAGC G|A 1
+ATCGC T|A 1
+GCGGC |A 1
+GCATC C| 1
+ATATC |G 1
+GCAAG G|A 1
+GACAG A|C 1
+CGCAT T|C 1
+GGCAA C|G 1
+AAGAA C| 1
+AGACA A|G 1
diff --git a/genomix/genomix-pregelix/graph/SimplePath_out.ps b/genomix/genomix-pregelix/graph/SimplePath_out.ps
new file mode 100644
index 0000000..3b3bf39
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/SimplePath_out.ps
@@ -0,0 +1,659 @@
+%!PS-Adobe-3.0
+%%Creator: graphviz version 2.26.3 (20100126.1600)
+%%Title: G
+%%Pages: (atend)
+%%BoundingBox: (atend)
+%%EndComments
+save
+%%BeginProlog
+/DotDict 200 dict def
+DotDict begin
+
+/setupLatin1 {
+mark
+/EncodingVector 256 array def
+ EncodingVector 0
+
+ISOLatin1Encoding 0 255 getinterval putinterval
+EncodingVector 45 /hyphen put
+
+% Set up ISO Latin 1 character encoding
+/starnetISO {
+ dup dup findfont dup length dict begin
+ { 1 index /FID ne { def }{ pop pop } ifelse
+ } forall
+ /Encoding EncodingVector def
+ currentdict end definefont
+} def
+/Times-Roman starnetISO def
+/Times-Italic starnetISO def
+/Times-Bold starnetISO def
+/Times-BoldItalic starnetISO def
+/Helvetica starnetISO def
+/Helvetica-Oblique starnetISO def
+/Helvetica-Bold starnetISO def
+/Helvetica-BoldOblique starnetISO def
+/Courier starnetISO def
+/Courier-Oblique starnetISO def
+/Courier-Bold starnetISO def
+/Courier-BoldOblique starnetISO def
+cleartomark
+} bind def
+
+%%BeginResource: procset graphviz 0 0
+/coord-font-family /Times-Roman def
+/default-font-family /Times-Roman def
+/coordfont coord-font-family findfont 8 scalefont def
+
+/InvScaleFactor 1.0 def
+/set_scale {
+ dup 1 exch div /InvScaleFactor exch def
+ scale
+} bind def
+
+% styles
+/solid { [] 0 setdash } bind def
+/dashed { [9 InvScaleFactor mul dup ] 0 setdash } bind def
+/dotted { [1 InvScaleFactor mul 6 InvScaleFactor mul] 0 setdash } bind def
+/invis {/fill {newpath} def /stroke {newpath} def /show {pop newpath} def} bind def
+/bold { 2 setlinewidth } bind def
+/filled { } bind def
+/unfilled { } bind def
+/rounded { } bind def
+/diagonals { } bind def
+
+% hooks for setting color
+/nodecolor { sethsbcolor } bind def
+/edgecolor { sethsbcolor } bind def
+/graphcolor { sethsbcolor } bind def
+/nopcolor {pop pop pop} bind def
+
+/beginpage { % i j npages
+ /npages exch def
+ /j exch def
+ /i exch def
+ /str 10 string def
+ npages 1 gt {
+ gsave
+ coordfont setfont
+ 0 0 moveto
+ (\() show i str cvs show (,) show j str cvs show (\)) show
+ grestore
+ } if
+} bind def
+
+/set_font {
+ findfont exch
+ scalefont setfont
+} def
+
+% draw text fitted to its expected width
+/alignedtext { % width text
+ /text exch def
+ /width exch def
+ gsave
+ width 0 gt {
+ [] 0 setdash
+ text stringwidth pop width exch sub text length div 0 text ashow
+ } if
+ grestore
+} def
+
+/boxprim { % xcorner ycorner xsize ysize
+ 4 2 roll
+ moveto
+ 2 copy
+ exch 0 rlineto
+ 0 exch rlineto
+ pop neg 0 rlineto
+ closepath
+} bind def
+
+/ellipse_path {
+ /ry exch def
+ /rx exch def
+ /y exch def
+ /x exch def
+ matrix currentmatrix
+ newpath
+ x y translate
+ rx ry scale
+ 0 0 1 0 360 arc
+ setmatrix
+} bind def
+
+/endpage { showpage } bind def
+/showpage { } def
+
+/layercolorseq
+ [ % layer color sequence - darkest to lightest
+ [0 0 0]
+ [.2 .8 .8]
+ [.4 .8 .8]
+ [.6 .8 .8]
+ [.8 .8 .8]
+ ]
+def
+
+/layerlen layercolorseq length def
+
+/setlayer {/maxlayer exch def /curlayer exch def
+ layercolorseq curlayer 1 sub layerlen mod get
+ aload pop sethsbcolor
+ /nodecolor {nopcolor} def
+ /edgecolor {nopcolor} def
+ /graphcolor {nopcolor} def
+} bind def
+
+/onlayer { curlayer ne {invis} if } def
+
+/onlayers {
+ /myupper exch def
+ /mylower exch def
+ curlayer mylower lt
+ curlayer myupper gt
+ or
+ {invis} if
+} def
+
+/curlayer 0 def
+
+%%EndResource
+%%EndProlog
+%%BeginSetup
+14 default-font-family set_font
+1 setmiterlimit
+% /arrowlength 10 def
+% /arrowwidth 5 def
+
+% make sure pdfmark is harmless for PS-interpreters other than Distiller
+/pdfmark where {pop} {userdict /pdfmark /cleartomark load put} ifelse
+% make '<<' and '>>' safe on PS Level 1 devices
+/languagelevel where {pop languagelevel}{1} ifelse
+2 lt {
+ userdict (<<) cvn ([) cvn load put
+ userdict (>>) cvn ([) cvn load put
+} if
+
+%%EndSetup
+setupLatin1
+%%Page: 1 1
+%%PageBoundingBox: 36 36 366 452
+%%PageOrientation: Portrait
+0 0 1 beginpage
+gsave
+36 36 330 416 boxprim clip newpath
+1 1 set_scale 0 rotate 40 41 translate
+% AAGAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 389 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22 385.4 moveto 52 (AAGAC) alignedtext
+grestore
+% AGACA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 315 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 311.4 moveto 51 (AGACA) alignedtext
+grestore
+% AAGAC->AGACA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 370.33 moveto
+48 362.26 48 352.65 48 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 343.67 moveto
+48 333.67 lineto
+44.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 343.67 moveto
+48 333.67 lineto
+44.5 343.67 lineto
+closepath stroke
+grestore
+% GACAG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 241 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22 237.4 moveto 52 (GACAG) alignedtext
+grestore
+% AGACA->GACAG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 296.33 moveto
+48 288.26 48 278.65 48 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 269.67 moveto
+48 259.67 lineto
+44.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 269.67 moveto
+48 259.67 lineto
+44.5 269.67 lineto
+closepath stroke
+grestore
+% CAGCA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 93 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 89.4 moveto 51 (CAGCA) alignedtext
+grestore
+% AGCAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 19 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 15.4 moveto 51 (AGCAC) alignedtext
+grestore
+% CAGCA->AGCAC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 74.33 moveto
+48 66.26 48 56.65 48 47.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 47.67 moveto
+48 37.67 lineto
+44.5 47.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 47.67 moveto
+48 37.67 lineto
+44.5 47.67 lineto
+closepath stroke
+grestore
+% TCGCA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+160 167 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+135.5 163.4 moveto 49 (TCGCA) alignedtext
+grestore
+% CGCAT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+160 93 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+135.5 89.4 moveto 49 (CGCAT) alignedtext
+grestore
+% TCGCA->CGCAT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 160 148.33 moveto
+160 140.26 160 130.65 160 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 163.5 121.67 moveto
+160 111.67 lineto
+156.5 121.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 163.5 121.67 moveto
+160 111.67 lineto
+156.5 121.67 lineto
+closepath stroke
+grestore
+% GCATC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+160 19 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+135.5 15.4 moveto 49 (GCATC) alignedtext
+grestore
+% CGCAT->GCATC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 160 74.33 moveto
+160 66.26 160 56.65 160 47.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 163.5 47.67 moveto
+160 37.67 lineto
+156.5 47.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 163.5 47.67 moveto
+160 37.67 lineto
+156.5 47.67 lineto
+closepath stroke
+grestore
+% CGGCA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+273 315 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+247 311.4 moveto 52 (CGGCA) alignedtext
+grestore
+% GGCAA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+273 241 48.79 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+246.5 237.4 moveto 53 (GGCAA) alignedtext
+grestore
+% CGGCA->GGCAA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 273 296.33 moveto
+273 288.26 273 278.65 273 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 276.5 269.67 moveto
+273 259.67 lineto
+269.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 276.5 269.67 moveto
+273 259.67 lineto
+269.5 269.67 lineto
+closepath stroke
+grestore
+% GCAAG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+273 167 48.79 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+246.5 163.4 moveto 53 (GCAAG) alignedtext
+grestore
+% GGCAA->GCAAG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 273 222.33 moveto
+273 214.26 273 204.65 273 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 276.5 195.67 moveto
+273 185.67 lineto
+269.5 195.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 276.5 195.67 moveto
+273 185.67 lineto
+269.5 195.67 lineto
+closepath stroke
+grestore
+% TATCG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+160 315 44.05 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+136.5 311.4 moveto 47 (TATCG) alignedtext
+grestore
+% ATCGC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+160 241 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+135.5 237.4 moveto 49 (ATCGC) alignedtext
+grestore
+% TATCG->ATCGC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 160 296.33 moveto
+160 288.26 160 278.65 160 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 163.5 269.67 moveto
+160 259.67 lineto
+156.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 163.5 269.67 moveto
+160 259.67 lineto
+156.5 269.67 lineto
+closepath stroke
+grestore
+% ATCGC->TCGCA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 160 222.33 moveto
+160 214.26 160 204.65 160 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 163.5 195.67 moveto
+160 185.67 lineto
+156.5 195.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 163.5 195.67 moveto
+160 185.67 lineto
+156.5 195.67 lineto
+closepath stroke
+grestore
+% CAAGA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+273 93 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+247 89.4 moveto 52 (CAAGA) alignedtext
+grestore
+% AAGAA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+273 19 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+247 15.4 moveto 52 (AAGAA) alignedtext
+grestore
+% CAAGA->AAGAA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 273 74.33 moveto
+273 66.26 273 56.65 273 47.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 276.5 47.67 moveto
+273 37.67 lineto
+269.5 47.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 276.5 47.67 moveto
+273 37.67 lineto
+269.5 47.67 lineto
+closepath stroke
+grestore
+% ACAGC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 167 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 163.4 moveto 51 (ACAGC) alignedtext
+grestore
+% ACAGC->CAGCA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 148.33 moveto
+48 140.26 48 130.65 48 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 121.67 moveto
+48 111.67 lineto
+44.5 121.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 121.67 moveto
+48 111.67 lineto
+44.5 121.67 lineto
+closepath stroke
+grestore
+% GCGGC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+273 389 49 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+246 385.4 moveto 54 (GCGGC) alignedtext
+grestore
+% GCGGC->CGGCA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 273 370.33 moveto
+273 362.26 273 352.65 273 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 276.5 343.67 moveto
+273 333.67 lineto
+269.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 276.5 343.67 moveto
+273 333.67 lineto
+269.5 343.67 lineto
+closepath stroke
+grestore
+% ATATC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+160 389 43.13 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+137.5 385.4 moveto 45 (ATATC) alignedtext
+grestore
+% ATATC->TATCG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 160 370.33 moveto
+160 362.26 160 352.65 160 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 163.5 343.67 moveto
+160 333.67 lineto
+156.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 163.5 343.67 moveto
+160 333.67 lineto
+156.5 343.67 lineto
+closepath stroke
+grestore
+% GCAAG->CAAGA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 273 148.33 moveto
+273 140.26 273 130.65 273 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 276.5 121.67 moveto
+273 111.67 lineto
+269.5 121.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 276.5 121.67 moveto
+273 111.67 lineto
+269.5 121.67 lineto
+closepath stroke
+grestore
+% GACAG->ACAGC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 222.33 moveto
+48 214.26 48 204.65 48 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 195.67 moveto
+48 185.67 lineto
+44.5 195.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 195.67 moveto
+48 185.67 lineto
+44.5 195.67 lineto
+closepath stroke
+grestore
+endpage
+showpage
+grestore
+%%PageTrailer
+%%EndPage: 1
+%%Trailer
+%%Pages: 1
+%%BoundingBox: 36 36 366 452
+end
+restore
+%%EOF
diff --git a/genomix/genomix-pregelix/graph/SinglePath b/genomix/genomix-pregelix/graph/SinglePath
new file mode 100644
index 0000000..02f42ba
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/SinglePath
@@ -0,0 +1,6 @@
+ACAAC G|A 1
+CAACA A|G 1
+ACAGT A| 1
+AACAG C|T 1
+GACAA A|C 1
+AGACA |A 1
diff --git a/genomix/genomix-pregelix/graph/SinglePath_out.ps b/genomix/genomix-pregelix/graph/SinglePath_out.ps
new file mode 100644
index 0000000..8371636
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/SinglePath_out.ps
@@ -0,0 +1,351 @@
+%!PS-Adobe-3.0
+%%Creator: graphviz version 2.26.3 (20100126.1600)
+%%Title: G
+%%Pages: (atend)
+%%BoundingBox: (atend)
+%%EndComments
+save
+%%BeginProlog
+/DotDict 200 dict def
+DotDict begin
+
+/setupLatin1 {
+mark
+/EncodingVector 256 array def
+ EncodingVector 0
+
+ISOLatin1Encoding 0 255 getinterval putinterval
+EncodingVector 45 /hyphen put
+
+% Set up ISO Latin 1 character encoding
+/starnetISO {
+ dup dup findfont dup length dict begin
+ { 1 index /FID ne { def }{ pop pop } ifelse
+ } forall
+ /Encoding EncodingVector def
+ currentdict end definefont
+} def
+/Times-Roman starnetISO def
+/Times-Italic starnetISO def
+/Times-Bold starnetISO def
+/Times-BoldItalic starnetISO def
+/Helvetica starnetISO def
+/Helvetica-Oblique starnetISO def
+/Helvetica-Bold starnetISO def
+/Helvetica-BoldOblique starnetISO def
+/Courier starnetISO def
+/Courier-Oblique starnetISO def
+/Courier-Bold starnetISO def
+/Courier-BoldOblique starnetISO def
+cleartomark
+} bind def
+
+%%BeginResource: procset graphviz 0 0
+/coord-font-family /Times-Roman def
+/default-font-family /Times-Roman def
+/coordfont coord-font-family findfont 8 scalefont def
+
+/InvScaleFactor 1.0 def
+/set_scale {
+ dup 1 exch div /InvScaleFactor exch def
+ scale
+} bind def
+
+% styles
+/solid { [] 0 setdash } bind def
+/dashed { [9 InvScaleFactor mul dup ] 0 setdash } bind def
+/dotted { [1 InvScaleFactor mul 6 InvScaleFactor mul] 0 setdash } bind def
+/invis {/fill {newpath} def /stroke {newpath} def /show {pop newpath} def} bind def
+/bold { 2 setlinewidth } bind def
+/filled { } bind def
+/unfilled { } bind def
+/rounded { } bind def
+/diagonals { } bind def
+
+% hooks for setting color
+/nodecolor { sethsbcolor } bind def
+/edgecolor { sethsbcolor } bind def
+/graphcolor { sethsbcolor } bind def
+/nopcolor {pop pop pop} bind def
+
+/beginpage { % i j npages
+ /npages exch def
+ /j exch def
+ /i exch def
+ /str 10 string def
+ npages 1 gt {
+ gsave
+ coordfont setfont
+ 0 0 moveto
+ (\() show i str cvs show (,) show j str cvs show (\)) show
+ grestore
+ } if
+} bind def
+
+/set_font {
+ findfont exch
+ scalefont setfont
+} def
+
+% draw text fitted to its expected width
+/alignedtext { % width text
+ /text exch def
+ /width exch def
+ gsave
+ width 0 gt {
+ [] 0 setdash
+ text stringwidth pop width exch sub text length div 0 text ashow
+ } if
+ grestore
+} def
+
+/boxprim { % xcorner ycorner xsize ysize
+ 4 2 roll
+ moveto
+ 2 copy
+ exch 0 rlineto
+ 0 exch rlineto
+ pop neg 0 rlineto
+ closepath
+} bind def
+
+/ellipse_path {
+ /ry exch def
+ /rx exch def
+ /y exch def
+ /x exch def
+ matrix currentmatrix
+ newpath
+ x y translate
+ rx ry scale
+ 0 0 1 0 360 arc
+ setmatrix
+} bind def
+
+/endpage { showpage } bind def
+/showpage { } def
+
+/layercolorseq
+ [ % layer color sequence - darkest to lightest
+ [0 0 0]
+ [.2 .8 .8]
+ [.4 .8 .8]
+ [.6 .8 .8]
+ [.8 .8 .8]
+ ]
+def
+
+/layerlen layercolorseq length def
+
+/setlayer {/maxlayer exch def /curlayer exch def
+ layercolorseq curlayer 1 sub layerlen mod get
+ aload pop sethsbcolor
+ /nodecolor {nopcolor} def
+ /edgecolor {nopcolor} def
+ /graphcolor {nopcolor} def
+} bind def
+
+/onlayer { curlayer ne {invis} if } def
+
+/onlayers {
+ /myupper exch def
+ /mylower exch def
+ curlayer mylower lt
+ curlayer myupper gt
+ or
+ {invis} if
+} def
+
+/curlayer 0 def
+
+%%EndResource
+%%EndProlog
+%%BeginSetup
+14 default-font-family set_font
+1 setmiterlimit
+% /arrowlength 10 def
+% /arrowwidth 5 def
+
+% make sure pdfmark is harmless for PS-interpreters other than Distiller
+/pdfmark where {pop} {userdict /pdfmark /cleartomark load put} ifelse
+% make '<<' and '>>' safe on PS Level 1 devices
+/languagelevel where {pop languagelevel}{1} ifelse
+2 lt {
+ userdict (<<) cvn ([) cvn load put
+ userdict (>>) cvn ([) cvn load put
+} if
+
+%%EndSetup
+setupLatin1
+%%Page: 1 1
+%%PageBoundingBox: 36 36 140 452
+%%PageOrientation: Portrait
+0 0 1 beginpage
+gsave
+36 36 104 416 boxprim clip newpath
+1 1 set_scale 0 rotate 40 41 translate
+% ACAAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 241 46.17 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23 237.4 moveto 50 (ACAAC) alignedtext
+grestore
+% CAACA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 167 46.17 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23 163.4 moveto 50 (CAACA) alignedtext
+grestore
+% ACAAC->CAACA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 222.33 moveto
+48 214.26 48 204.65 48 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 195.67 moveto
+48 185.67 lineto
+44.5 195.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 195.67 moveto
+48 185.67 lineto
+44.5 195.67 lineto
+closepath stroke
+grestore
+% AACAG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 93 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22 89.4 moveto 52 (AACAG) alignedtext
+grestore
+% CAACA->AACAG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 148.33 moveto
+48 140.26 48 130.65 48 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 121.67 moveto
+48 111.67 lineto
+44.5 121.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 121.67 moveto
+48 111.67 lineto
+44.5 121.67 lineto
+closepath stroke
+grestore
+% ACAGT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 19 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23.5 15.4 moveto 49 (ACAGT) alignedtext
+grestore
+% AACAG->ACAGT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 74.33 moveto
+48 66.26 48 56.65 48 47.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 47.67 moveto
+48 37.67 lineto
+44.5 47.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 47.67 moveto
+48 37.67 lineto
+44.5 47.67 lineto
+closepath stroke
+grestore
+% GACAA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 315 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22 311.4 moveto 52 (GACAA) alignedtext
+grestore
+% GACAA->ACAAC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 296.33 moveto
+48 288.26 48 278.65 48 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 269.67 moveto
+48 259.67 lineto
+44.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 269.67 moveto
+48 259.67 lineto
+44.5 269.67 lineto
+closepath stroke
+grestore
+% AGACA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 389 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 385.4 moveto 51 (AGACA) alignedtext
+grestore
+% AGACA->GACAA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 370.33 moveto
+48 362.26 48 352.65 48 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 343.67 moveto
+48 333.67 lineto
+44.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 343.67 moveto
+48 333.67 lineto
+44.5 343.67 lineto
+closepath stroke
+grestore
+endpage
+showpage
+grestore
+%%PageTrailer
+%%EndPage: 1
+%%Trailer
+%%Pages: 1
+%%BoundingBox: 36 36 140 452
+end
+restore
+%%EOF
diff --git a/genomix/genomix-pregelix/graph/ThreeKmer b/genomix/genomix-pregelix/graph/ThreeKmer
new file mode 100644
index 0000000..7ce9890
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/ThreeKmer
@@ -0,0 +1,3 @@
+CTCGG A|T 1
+ACTCG |G 1
+TCGGT C| 1
diff --git a/genomix/genomix-pregelix/graph/TreePath b/genomix/genomix-pregelix/graph/TreePath
new file mode 100644
index 0000000..0a3d5c6
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/TreePath
@@ -0,0 +1,29 @@
+CTAAA A|C 1
+GTAAC A|T 1
+CTCAG C|T 2
+GCTAT G|C 1
+AGTAC C|G 1
+GGCCT |CG 3
+ATCCC T| 1
+ACGCC T|C 1
+CCTCA G|G 2
+CCCGG G| 1
+CCTGG G|C 1
+GCCTC G|A 2
+CAGTA T|AC 2
+TAAAC C| 1
+ACTAA A|A 1
+AGTAA C|C 1
+TAACT G|A 1
+GTACG A|C 1
+GCCCG C|G 1
+CGCCC A|G 1
+TGGCT C|A 1
+TATCC C|C 1
+TCAGT C|A 2
+TACGC G|C 1
+CTGGC C|T 1
+CTATC G|C 1
+AACTA T|A 1
+GCCTG G|G 1
+GGCTA T|T 1
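
Unlike the single chains in the other fixtures, TreePath branches: GGCCT |CG 3 fans out to both GCCTC and GCCTG, and CAGTA T|AC 2 to AGTAA and AGTAC, which is presumably why the rendering below lays the graph out as a tree.
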
diff --git a/genomix/genomix-pregelix/graph/TreePath_out.ps b/genomix/genomix-pregelix/graph/TreePath_out.ps
new file mode 100644
index 0000000..a1e7da1
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/TreePath_out.ps
@@ -0,0 +1,1018 @@
+%!PS-Adobe-3.0
+%%Creator: graphviz version 2.26.3 (20100126.1600)
+%%Title: G
+%%Pages: (atend)
+%%BoundingBox: (atend)
+%%EndComments
+save
+%%BeginProlog
+/DotDict 200 dict def
+DotDict begin
+
+/setupLatin1 {
+mark
+/EncodingVector 256 array def
+ EncodingVector 0
+
+ISOLatin1Encoding 0 255 getinterval putinterval
+EncodingVector 45 /hyphen put
+
+% Set up ISO Latin 1 character encoding
+/starnetISO {
+ dup dup findfont dup length dict begin
+ { 1 index /FID ne { def }{ pop pop } ifelse
+ } forall
+ /Encoding EncodingVector def
+ currentdict end definefont
+} def
+/Times-Roman starnetISO def
+/Times-Italic starnetISO def
+/Times-Bold starnetISO def
+/Times-BoldItalic starnetISO def
+/Helvetica starnetISO def
+/Helvetica-Oblique starnetISO def
+/Helvetica-Bold starnetISO def
+/Helvetica-BoldOblique starnetISO def
+/Courier starnetISO def
+/Courier-Oblique starnetISO def
+/Courier-Bold starnetISO def
+/Courier-BoldOblique starnetISO def
+cleartomark
+} bind def
+
+%%BeginResource: procset graphviz 0 0
+/coord-font-family /Times-Roman def
+/default-font-family /Times-Roman def
+/coordfont coord-font-family findfont 8 scalefont def
+
+/InvScaleFactor 1.0 def
+/set_scale {
+ dup 1 exch div /InvScaleFactor exch def
+ scale
+} bind def
+
+% styles
+/solid { [] 0 setdash } bind def
+/dashed { [9 InvScaleFactor mul dup ] 0 setdash } bind def
+/dotted { [1 InvScaleFactor mul 6 InvScaleFactor mul] 0 setdash } bind def
+/invis {/fill {newpath} def /stroke {newpath} def /show {pop newpath} def} bind def
+/bold { 2 setlinewidth } bind def
+/filled { } bind def
+/unfilled { } bind def
+/rounded { } bind def
+/diagonals { } bind def
+
+% hooks for setting color
+/nodecolor { sethsbcolor } bind def
+/edgecolor { sethsbcolor } bind def
+/graphcolor { sethsbcolor } bind def
+/nopcolor {pop pop pop} bind def
+
+/beginpage { % i j npages
+ /npages exch def
+ /j exch def
+ /i exch def
+ /str 10 string def
+ npages 1 gt {
+ gsave
+ coordfont setfont
+ 0 0 moveto
+ (\() show i str cvs show (,) show j str cvs show (\)) show
+ grestore
+ } if
+} bind def
+
+/set_font {
+ findfont exch
+ scalefont setfont
+} def
+
+% draw text fitted to its expected width
+/alignedtext { % width text
+ /text exch def
+ /width exch def
+ gsave
+ width 0 gt {
+ [] 0 setdash
+ text stringwidth pop width exch sub text length div 0 text ashow
+ } if
+ grestore
+} def
+
+/boxprim { % xcorner ycorner xsize ysize
+ 4 2 roll
+ moveto
+ 2 copy
+ exch 0 rlineto
+ 0 exch rlineto
+ pop neg 0 rlineto
+ closepath
+} bind def
+
+/ellipse_path {
+ /ry exch def
+ /rx exch def
+ /y exch def
+ /x exch def
+ matrix currentmatrix
+ newpath
+ x y translate
+ rx ry scale
+ 0 0 1 0 360 arc
+ setmatrix
+} bind def
+
+/endpage { showpage } bind def
+/showpage { } def
+
+/layercolorseq
+ [ % layer color sequence - darkest to lightest
+ [0 0 0]
+ [.2 .8 .8]
+ [.4 .8 .8]
+ [.6 .8 .8]
+ [.8 .8 .8]
+ ]
+def
+
+/layerlen layercolorseq length def
+
+/setlayer {/maxlayer exch def /curlayer exch def
+ layercolorseq curlayer 1 sub layerlen mod get
+ aload pop sethsbcolor
+ /nodecolor {nopcolor} def
+ /edgecolor {nopcolor} def
+ /graphcolor {nopcolor} def
+} bind def
+
+/onlayer { curlayer ne {invis} if } def
+
+/onlayers {
+ /myupper exch def
+ /mylower exch def
+ curlayer mylower lt
+ curlayer myupper gt
+ or
+ {invis} if
+} def
+
+/curlayer 0 def
+
+%%EndResource
+%%EndProlog
+%%BeginSetup
+14 default-font-family set_font
+1 setmiterlimit
+% /arrowlength 10 def
+% /arrowwidth 5 def
+
+% make sure pdfmark is harmless for PS-interpreters other than Distiller
+/pdfmark where {pop} {userdict /pdfmark /cleartomark load put} ifelse
+% make '<<' and '>>' safe on PS Level 1 devices
+/languagelevel where {pop languagelevel}{1} ifelse
+2 lt {
+ userdict (<<) cvn ([) cvn load put
+ userdict (>>) cvn ([) cvn load put
+} if
+
+%%EndSetup
+setupLatin1
+%%Page: 1 1
+%%PageBoundingBox: 36 36 359 970
+%%PageOrientation: Portrait
+0 0 1 beginpage
+gsave
+36 36 323 934 boxprim clip newpath
+1 1 set_scale 0 rotate 40 41 translate
+% CTAAA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+160 93 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+135.5 89.4 moveto 49 (CTAAA) alignedtext
+grestore
+% TAAAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+160 19 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+135.5 15.4 moveto 49 (TAAAC) alignedtext
+grestore
+% CTAAA->TAAAC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 160 74.33 moveto
+160 66.26 160 56.65 160 47.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 163.5 47.67 moveto
+160 37.67 lineto
+156.5 47.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 163.5 47.67 moveto
+160 37.67 lineto
+156.5 47.67 lineto
+closepath stroke
+grestore
+% GTAAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+158 389 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+133.5 385.4 moveto 49 (GTAAC) alignedtext
+grestore
+% TAACT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+158 315 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+134 311.4 moveto 48 (TAACT) alignedtext
+grestore
+% GTAAC->TAACT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 158 370.33 moveto
+158 362.26 158 352.65 158 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 161.5 343.67 moveto
+158 333.67 lineto
+154.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 161.5 343.67 moveto
+158 333.67 lineto
+154.5 343.67 lineto
+closepath stroke
+grestore
+% AACTA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+159 241 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+134.5 237.4 moveto 49 (AACTA) alignedtext
+grestore
+% TAACT->AACTA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 158.25 296.33 moveto
+158.36 288.26 158.49 278.65 158.61 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 162.11 269.71 moveto
+158.75 259.67 lineto
+155.11 269.62 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 162.11 269.71 moveto
+158.75 259.67 lineto
+155.11 269.62 lineto
+closepath stroke
+grestore
+% CTCAG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+155 685 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+130.5 681.4 moveto 49 (CTCAG) alignedtext
+grestore
+% TCAGT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+155 611 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+131 607.4 moveto 48 (TCAGT) alignedtext
+grestore
+% CTCAG->TCAGT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 155 666.33 moveto
+155 658.26 155 648.65 155 639.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 158.5 639.67 moveto
+155 629.67 lineto
+151.5 639.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 158.5 639.67 moveto
+155 629.67 lineto
+151.5 639.67 lineto
+closepath stroke
+grestore
+% CAGTA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+155 537 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+130.5 533.4 moveto 49 (CAGTA) alignedtext
+grestore
+% TCAGT->CAGTA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 155 592.33 moveto
+155 584.26 155 574.65 155 565.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 158.5 565.67 moveto
+155 555.67 lineto
+151.5 565.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 158.5 565.67 moveto
+155 555.67 lineto
+151.5 565.67 lineto
+closepath stroke
+grestore
+% GCTAT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 463 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+243 459.4 moveto 48 (GCTAT) alignedtext
+grestore
+% CTATC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 389 43.84 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+244 385.4 moveto 46 (CTATC) alignedtext
+grestore
+% GCTAT->CTATC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 267 444.33 moveto
+267 436.26 267 426.65 267 417.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 270.5 417.67 moveto
+267 407.67 lineto
+263.5 417.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 270.5 417.67 moveto
+267 407.67 lineto
+263.5 417.67 lineto
+closepath stroke
+grestore
+% TATCC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 315 43.84 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+244 311.4 moveto 46 (TATCC) alignedtext
+grestore
+% CTATC->TATCC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 267 370.33 moveto
+267 362.26 267 352.65 267 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 270.5 343.67 moveto
+267 333.67 lineto
+263.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 270.5 343.67 moveto
+267 333.67 lineto
+263.5 343.67 lineto
+closepath stroke
+grestore
+% AGTAC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 463 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23.5 459.4 moveto 49 (AGTAC) alignedtext
+grestore
+% GTACG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 389 46.17 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23 385.4 moveto 50 (GTACG) alignedtext
+grestore
+% AGTAC->GTACG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 444.33 moveto
+48 436.26 48 426.65 48 417.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 417.67 moveto
+48 407.67 lineto
+44.5 417.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 417.67 moveto
+48 407.67 lineto
+44.5 417.67 lineto
+closepath stroke
+grestore
+% TACGC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 315 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+23.5 311.4 moveto 49 (TACGC) alignedtext
+grestore
+% GTACG->TACGC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 370.33 moveto
+48 362.26 48 352.65 48 343.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 343.67 moveto
+48 333.67 lineto
+44.5 343.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 343.67 moveto
+48 333.67 lineto
+44.5 343.67 lineto
+closepath stroke
+grestore
+% GGCCT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+211 907 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+185 903.4 moveto 52 (GGCCT) alignedtext
+grestore
+% GCCTC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+155 833 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+130.5 829.4 moveto 49 (GCCTC) alignedtext
+grestore
+% GGCCT->GCCTC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 197.44 889.09 moveto
+190.57 880 182.09 868.79 174.52 858.79 curveto
+stroke
+0 0 0 edgecolor
+newpath 177.26 856.61 moveto
+168.43 850.75 lineto
+171.67 860.83 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 177.26 856.61 moveto
+168.43 850.75 lineto
+171.67 860.83 lineto
+closepath stroke
+grestore
+% GCCTG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 833 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+241 829.4 moveto 52 (GCCTG) alignedtext
+grestore
+% GGCCT->GCCTG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 224.56 889.09 moveto
+231.38 880.06 239.79 868.96 247.32 859.01 curveto
+stroke
+0 0 0 edgecolor
+newpath 250.13 861.09 moveto
+253.37 851 lineto
+244.55 856.87 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 250.13 861.09 moveto
+253.37 851 lineto
+244.55 856.87 lineto
+closepath stroke
+grestore
+% CCTCA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+155 759 44.76 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+131 755.4 moveto 48 (CCTCA) alignedtext
+grestore
+% GCCTC->CCTCA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 155 814.33 moveto
+155 806.26 155 796.65 155 787.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 158.5 787.67 moveto
+155 777.67 lineto
+151.5 787.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 158.5 787.67 moveto
+155 777.67 lineto
+151.5 787.67 lineto
+closepath stroke
+grestore
+% CCTGG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 759 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+241 755.4 moveto 52 (CCTGG) alignedtext
+grestore
+% GCCTG->CCTGG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 267 814.33 moveto
+267 806.26 267 796.65 267 787.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 270.5 787.67 moveto
+267 777.67 lineto
+263.5 787.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 270.5 787.67 moveto
+267 777.67 lineto
+263.5 787.67 lineto
+closepath stroke
+grestore
+% ACGCC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 241 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 237.4 moveto 51 (ACGCC) alignedtext
+grestore
+% CGCCC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 167 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22.5 163.4 moveto 51 (CGCCC) alignedtext
+grestore
+% ACGCC->CGCCC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 222.33 moveto
+48 214.26 48 204.65 48 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 195.67 moveto
+48 185.67 lineto
+44.5 195.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 195.67 moveto
+48 185.67 lineto
+44.5 195.67 lineto
+closepath stroke
+grestore
+% GCCCG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 93 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22 89.4 moveto 52 (GCCCG) alignedtext
+grestore
+% CGCCC->GCCCG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 148.33 moveto
+48 140.26 48 130.65 48 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 121.67 moveto
+48 111.67 lineto
+44.5 121.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 121.67 moveto
+48 111.67 lineto
+44.5 121.67 lineto
+closepath stroke
+grestore
+% CCTCA->CTCAG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 155 740.33 moveto
+155 732.26 155 722.65 155 713.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 158.5 713.67 moveto
+155 703.67 lineto
+151.5 713.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 158.5 713.67 moveto
+155 703.67 lineto
+151.5 713.67 lineto
+closepath stroke
+grestore
+% CTGGC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 685 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+241 681.4 moveto 52 (CTGGC) alignedtext
+grestore
+% CCTGG->CTGGC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 267 740.33 moveto
+267 732.26 267 722.65 267 713.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 270.5 713.67 moveto
+267 703.67 lineto
+263.5 713.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 270.5 713.67 moveto
+267 703.67 lineto
+263.5 713.67 lineto
+closepath stroke
+grestore
+% TGGCT
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 611 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+241.5 607.4 moveto 51 (TGGCT) alignedtext
+grestore
+% CTGGC->TGGCT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 267 666.33 moveto
+267 658.26 267 648.65 267 639.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 270.5 639.67 moveto
+267 629.67 lineto
+263.5 639.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 270.5 639.67 moveto
+267 629.67 lineto
+263.5 639.67 lineto
+closepath stroke
+grestore
+% CAGTA->AGTAC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 131.78 520.94 moveto
+116.5 510.37 96.3 496.4 79.56 484.83 curveto
+stroke
+0 0 0 edgecolor
+newpath 81.39 481.84 moveto
+71.17 479.03 lineto
+77.41 487.59 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 81.39 481.84 moveto
+71.17 479.03 lineto
+77.41 487.59 lineto
+closepath stroke
+grestore
+% AGTAA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+158 463 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+133.5 459.4 moveto 49 (AGTAA) alignedtext
+grestore
+% CAGTA->AGTAA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 155.76 518.33 moveto
+156.08 510.26 156.47 500.65 156.84 491.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 160.34 491.8 moveto
+157.24 481.67 lineto
+153.34 491.52 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 160.34 491.8 moveto
+157.24 481.67 lineto
+153.34 491.52 lineto
+closepath stroke
+grestore
+% AGTAA->GTAAC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 158 444.33 moveto
+158 436.26 158 426.65 158 417.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 161.5 417.67 moveto
+158 407.67 lineto
+154.5 417.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 161.5 417.67 moveto
+158 407.67 lineto
+154.5 417.67 lineto
+closepath stroke
+grestore
+% ACTAA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+159 167 45.96 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+134.5 163.4 moveto 49 (ACTAA) alignedtext
+grestore
+% ACTAA->CTAAA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 159.25 148.33 moveto
+159.36 140.26 159.49 130.65 159.61 121.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 163.11 121.71 moveto
+159.75 111.67 lineto
+156.11 121.62 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 163.11 121.71 moveto
+159.75 111.67 lineto
+156.11 121.62 lineto
+closepath stroke
+grestore
+% AACTA->ACTAA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 159 222.33 moveto
+159 214.26 159 204.65 159 195.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 162.5 195.67 moveto
+159 185.67 lineto
+155.5 195.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 162.5 195.67 moveto
+159 185.67 lineto
+155.5 195.67 lineto
+closepath stroke
+grestore
+% TACGC->ACGCC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 296.33 moveto
+48 288.26 48 278.65 48 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 269.67 moveto
+48 259.67 lineto
+44.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 269.67 moveto
+48 259.67 lineto
+44.5 269.67 lineto
+closepath stroke
+grestore
+% CCCGG
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+48 19 48.08 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+22 15.4 moveto 52 (CCCGG) alignedtext
+grestore
+% GCCCG->CCCGG
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 48 74.33 moveto
+48 66.26 48 56.65 48 47.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 51.5 47.67 moveto
+48 37.67 lineto
+44.5 47.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 51.5 47.67 moveto
+48 37.67 lineto
+44.5 47.67 lineto
+closepath stroke
+grestore
+% GGCTA
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 537 46.88 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+241.5 533.4 moveto 51 (GGCTA) alignedtext
+grestore
+% TGGCT->GGCTA
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 267 592.33 moveto
+267 584.26 267 574.65 267 565.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 270.5 565.67 moveto
+267 555.67 lineto
+263.5 565.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 270.5 565.67 moveto
+267 555.67 lineto
+263.5 565.67 lineto
+closepath stroke
+grestore
+% GGCTA->GCTAT
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 267 518.33 moveto
+267 510.26 267 500.65 267 491.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 270.5 491.67 moveto
+267 481.67 lineto
+263.5 491.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 270.5 491.67 moveto
+267 481.67 lineto
+263.5 491.67 lineto
+closepath stroke
+grestore
+% ATCCC
+gsave
+1 setlinewidth
+0 0 0 nodecolor
+267 241 44.05 18.38 ellipse_path stroke
+0 0 0 nodecolor
+14 /Times-Roman set_font
+243.5 237.4 moveto 47 (ATCCC) alignedtext
+grestore
+% TATCC->ATCCC
+gsave
+1 setlinewidth
+0 0 0 edgecolor
+newpath 267 296.33 moveto
+267 288.26 267 278.65 267 269.71 curveto
+stroke
+0 0 0 edgecolor
+newpath 270.5 269.67 moveto
+267 259.67 lineto
+263.5 269.67 lineto
+closepath fill
+1 setlinewidth
+solid
+0 0 0 edgecolor
+newpath 270.5 269.67 moveto
+267 259.67 lineto
+263.5 269.67 lineto
+closepath stroke
+grestore
+endpage
+showpage
+grestore
+%%PageTrailer
+%%EndPage: 1
+%%Trailer
+%%Pages: 1
+%%BoundingBox: 36 36 359 970
+end
+restore
+%%EOF
diff --git a/genomix/genomix-pregelix/graph/mergeTest/BridgePath b/genomix/genomix-pregelix/graph/mergeTest/BridgePath
new file mode 100644
index 0000000..0717611
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/BridgePath
@@ -0,0 +1,2 @@
+TTTCCACTCCGTG
+TTTCCACCCCGTG
\ No newline at end of file
diff --git a/genomix/genomix-pregelix/graph/mergeTest/CyclePath b/genomix/genomix-pregelix/graph/mergeTest/CyclePath
new file mode 100644
index 0000000..04080f4
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/CyclePath
@@ -0,0 +1 @@
+GCAACTTCATCAACT
\ No newline at end of file
diff --git a/genomix/genomix-pregelix/graph/mergeTest/LongPath b/genomix/genomix-pregelix/graph/mergeTest/LongPath
new file mode 100644
index 0000000..acd3c1a
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/LongPath
@@ -0,0 +1 @@
+GGCCTCAGTACGCCCGG
diff --git a/genomix/genomix-pregelix/graph/mergeTest/Path b/genomix/genomix-pregelix/graph/mergeTest/Path
new file mode 100644
index 0000000..f63bbcf
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/Path
@@ -0,0 +1 @@
+GGCCTCAGTACG
diff --git a/genomix/genomix-pregelix/graph/mergeTest/SimplePath b/genomix/genomix-pregelix/graph/mergeTest/SimplePath
new file mode 100644
index 0000000..80c03af
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/SimplePath
@@ -0,0 +1,3 @@
+ATATCGCATC
+AAGACAGCAC
+GCGGCAAGAA
diff --git a/genomix/genomix-pregelix/graph/mergeTest/SinglePath b/genomix/genomix-pregelix/graph/mergeTest/SinglePath
new file mode 100644
index 0000000..56ef5f8
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/SinglePath
@@ -0,0 +1 @@
+AGACAACAGT
diff --git a/genomix/genomix-pregelix/graph/mergeTest/ThreeKmer b/genomix/genomix-pregelix/graph/mergeTest/ThreeKmer
new file mode 100644
index 0000000..ec004fa
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/ThreeKmer
@@ -0,0 +1 @@
+ACTCGGT
diff --git a/genomix/genomix-pregelix/graph/mergeTest/TreePath b/genomix/genomix-pregelix/graph/mergeTest/TreePath
new file mode 100644
index 0000000..f3c13ce
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/TreePath
@@ -0,0 +1,3 @@
+GGCCTGGCTATCCC
+GGCCTCAGTAACTAAAC
+GGCCTCAGTACGCCCGG
\ No newline at end of file
diff --git a/genomix/genomix-pregelix/graph/mergeTest/TwoKmer b/genomix/genomix-pregelix/graph/mergeTest/TwoKmer
new file mode 100644
index 0000000..86790c6
--- /dev/null
+++ b/genomix/genomix-pregelix/graph/mergeTest/TwoKmer
@@ -0,0 +1,2 @@
+ACACT |G 1
+CACTG A| 1
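Note on the fixture format: each TwoKmer line is a k-mer, an adjacency field of
the form predecessors|successors, and what appears to be a multiplicity count,
so "ACACT |G 1" is a k-mer with no predecessor, one successor G, and count 1.
A minimal decode sketch under that assumption (tab-separated, as Graph.java
further down in this patch parses it; all names illustrative):

    // turn one fixture line into DOT edges, assuming "<kmer>\t<pre>|<post>\t<count>"
    String[] fields = "ACACT\t|G\t1".split("\t");
    String kmer = fields[0];
    String[] adj = fields[1].split("\\|", -1); // adj[0]=predecessors, adj[1]=successors
    if (adj.length > 1)
        for (char c : adj[1].toCharArray())
            System.out.println(kmer + " -> " + kmer.substring(1) + c); // ACACT -> CACTG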
diff --git a/genomix/genomix-pregelix/pom.xml b/genomix/genomix-pregelix/pom.xml
new file mode 100644
index 0000000..854335f
--- /dev/null
+++ b/genomix/genomix-pregelix/pom.xml
@@ -0,0 +1,188 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>edu.uci.ics.pregelix</groupId>
+ <artifactId>genomix-pregelix</artifactId>
+ <packaging>jar</packaging>
+ <version>0.2.6-SNAPSHOT</version>
+ <name>genomix-pregelix</name>
+
+ <properties>
+ <jvm.extraargs/>
+ </properties>
+
+ <profiles>
+ <profile>
+ <id>macosx</id>
+ <activation>
+ <os>
+ <name>mac os x</name>
+ </os>
+ <jdk>1.7</jdk>
+ </activation>
+ <properties>
+ <jvm.extraargs>-Djava.nio.channels.spi.SelectorProvider=sun.nio.ch.KQueueSelectorProvider</jvm.extraargs>
+ </properties>
+ </profile>
+ </profiles>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.0.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ <fork>true</fork>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ </configuration>
+ <executions>
+ <execution>
+ <id>make-my-jar-with-dependencies</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>appassembler-maven-plugin</artifactId>
+ <version>1.3</version>
+ <executions>
+ <execution>
+ <configuration>
+ <programs>
+ <program>
+ <mainClass>edu.uci.ics.genomix.pregelix.example.Client</mainClass>
+ <name>pregelix</name>
+ </program>
+ </programs>
+ <repositoryLayout>flat</repositoryLayout>
+ <repositoryName>lib</repositoryName>
+ </configuration>
+ <phase>package</phase>
+ <goals>
+ <goal>assemble</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.7.2</version>
+ <configuration>
+ <forkMode>pertest</forkMode>
+ <argLine>-enableassertions -Xmx2047m -Dfile.encoding=UTF-8
+ -Djava.util.logging.config.file=src/test/resources/logging.properties</argLine>
+ <includes>
+ <include>**/*TestSuite.java</include>
+ <include>**/*Test.java</include>
+ </includes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-clean-plugin</artifactId>
+ <configuration>
+ <filesets>
+ <fileset>
+ <directory>.</directory>
+ <includes>
+ <include>teststore*</include>
+ <include>edu*</include>
+ <include>actual*</include>
+ <include>build*</include>
+ <include>expect*</include>
+ <include>ClusterController*</include>
+ <include>edu.uci.*</include>
+ </includes>
+ </fileset>
+ </filesets>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>pregelix-core</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>genomix-data</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ </dependencies>
+
+ <scm>
+ <connection>scm:svn:https://hyracks.googlecode.com/svn/trunk/fullstack/pregelix</connection>
+ <developerConnection>scm:svn:https://hyracks.googlecode.com/svn/trunk/fullstack/pregelix</developerConnection>
+ <url>http://code.google.com/p/hyracks/source/browse/#svn/trunk/fullstack/pregelix</url>
+ </scm>
+
+ <distributionManagement>
+ <repository>
+ <id>hyracks-releases</id>
+ <url>http://obelix.ics.uci.edu/nexus/content/repositories/hyracks-releases/</url>
+ </repository>
+ <snapshotRepository>
+ <id>hyracks-snapshots</id>
+ <url>http://obelix.ics.uci.edu/nexus/content/repositories/hyracks-snapshots/</url>
+ </snapshotRepository>
+ </distributionManagement>
+
+ <reporting>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-changelog-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </reporting>
+
+ <repositories>
+ <repository>
+ <id>hyracks-public</id>
+ <url>http://obelix.ics.uci.edu/nexus/content/groups/hyracks-public/</url>
+ </repository>
+ <repository>
+ <id>jboss-public</id>
+ <url>https://repository.jboss.org/nexus/content/groups/public/</url>
+ </repository>
+ </repositories>
+
+ <pluginRepositories>
+ <pluginRepository>
+ <id>hyracks-public</id>
+ <url>http://obelix.ics.uci.edu/nexus/content/groups/hyracks-public/</url>
+ <releases>
+ <updatePolicy>always</updatePolicy>
+ </releases>
+ </pluginRepository>
+ </pluginRepositories>
+</project>
diff --git a/genomix/genomix-pregelix/src/main/assembly/binary-assembly.xml b/genomix/genomix-pregelix/src/main/assembly/binary-assembly.xml
new file mode 100755
index 0000000..0500499
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/assembly/binary-assembly.xml
@@ -0,0 +1,19 @@
+<assembly>
+ <id>binary-assembly</id>
+ <formats>
+ <format>zip</format>
+ <format>dir</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <fileSets>
+ <fileSet>
+ <directory>target/appassembler/bin</directory>
+ <outputDirectory>bin</outputDirectory>
+ <fileMode>0755</fileMode>
+ </fileSet>
+ <fileSet>
+ <directory>target/appassembler/lib</directory>
+ <outputDirectory>lib</outputDirectory>
+ </fileSet>
+ </fileSets>
+</assembly>
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/api/io/binary/BinaryVertexInputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/api/io/binary/BinaryVertexInputFormat.java
new file mode 100644
index 0000000..e1868b1
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/api/io/binary/BinaryVertexInputFormat.java
@@ -0,0 +1,109 @@
+package edu.uci.ics.genomix.pregelix.api.io.binary;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+
+import edu.uci.ics.pregelix.api.io.VertexInputFormat;
+import edu.uci.ics.pregelix.api.io.VertexReader;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+public class BinaryVertexInputFormat <I extends WritableComparable<?>, V extends Writable, E extends Writable, M extends Writable>
+ extends VertexInputFormat<I, V, E, M>{
+
+ /** Uses the SequenceFileInputFormat to do everything */
+ @SuppressWarnings("rawtypes")
+ protected SequenceFileInputFormat binaryInputFormat = new SequenceFileInputFormat();
+
+ /**
+ * Abstract class to be implemented by the user based on their specific
+     * vertex input. It is easiest to ignore the key/value separator and use
+     * only the key.
+ *
+ * @param <I>
+ * Vertex index value
+ * @param <V>
+ * Vertex value
+ * @param <E>
+ * Edge value
+ */
+ public static abstract class BinaryVertexReader<I extends WritableComparable<?>, V extends Writable, E extends Writable, M extends Writable>
+ implements VertexReader<I, V, E, M> {
+        /** Internal record reader from the SequenceFileInputFormat */
+ private final RecordReader<KmerBytesWritable,KmerCountValue> lineRecordReader;
+ /** Context passed to initialize */
+ private TaskAttemptContext context;
+
+ /**
+ * Initialize with the LineRecordReader.
+ *
+ * @param recordReader
+ * Line record reader from SequenceFileInputFormat
+ */
+ public BinaryVertexReader(RecordReader<KmerBytesWritable, KmerCountValue> recordReader) {
+ this.lineRecordReader = recordReader;
+ }
+
+ @Override
+ public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException,
+ InterruptedException {
+ lineRecordReader.initialize(inputSplit, context);
+ this.context = context;
+ }
+
+ @Override
+ public void close() throws IOException {
+ lineRecordReader.close();
+ }
+
+ @Override
+ public float getProgress() throws IOException, InterruptedException {
+ return lineRecordReader.getProgress();
+ }
+
+ /**
+ * Get the line record reader.
+ *
+ * @return Record reader to be used for reading.
+ */
+ protected RecordReader<KmerBytesWritable,KmerCountValue> getRecordReader() {
+ return lineRecordReader;
+ }
+
+ /**
+ * Get the context.
+ *
+ * @return Context passed to initialize.
+ */
+ protected TaskAttemptContext getContext() {
+ return context;
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public List<InputSplit> getSplits(JobContext context, int numWorkers) throws IOException, InterruptedException {
+ // Ignore the hint of numWorkers here since we are using SequenceFileInputFormat
+ // to do this for us
+ return binaryInputFormat.getSplits(context);
+ }
+
+ @Override
+ public VertexReader<I, V, E, M> createVertexReader(InputSplit split,
+ TaskAttemptContext context) throws IOException {
+        // no default reader at this level: concrete input formats (e.g. the
+        // path-merge formats later in this patch) override createVertexReader
+        return null;
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/api/io/binary/BinaryVertexOutputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/api/io/binary/BinaryVertexOutputFormat.java
new file mode 100644
index 0000000..1435770
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/api/io/binary/BinaryVertexOutputFormat.java
@@ -0,0 +1,102 @@
+package edu.uci.ics.genomix.pregelix.api.io.binary;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.pregelix.api.io.VertexOutputFormat;
+import edu.uci.ics.pregelix.api.io.VertexWriter;
+
+/**
+ * Abstract class that users should subclass to use their own binary vertex
+ * output format.
+ *
+ * @param <I>
+ * Vertex index value
+ * @param <V>
+ * Vertex value
+ * @param <E>
+ * Edge value
+ */
+@SuppressWarnings("rawtypes")
+public abstract class BinaryVertexOutputFormat<I extends WritableComparable, V extends Writable, E extends Writable>
+ extends VertexOutputFormat<I, V, E> {
+ /** Uses the SequenceFileOutputFormat to do everything */
+ protected SequenceFileOutputFormat binaryOutputFormat = new SequenceFileOutputFormat();
+
+ /**
+ * Abstract class to be implemented by the user based on their specific
+     * vertex output. It is easiest to ignore the key/value separator and use
+     * only the key.
+ *
+ * @param <I>
+ * Vertex index value
+ * @param <V>
+ * Vertex value
+ * @param <E>
+ * Edge value
+ */
+ public static abstract class BinaryVertexWriter<I extends WritableComparable, V extends Writable, E extends Writable>
+ implements VertexWriter<I, V, E> {
+ /** Context passed to initialize */
+ private TaskAttemptContext context;
+ /** Internal line record writer */
+ private final RecordWriter<KmerBytesWritable, ValueStateWritable> lineRecordWriter;
+
+ /**
+ * Initialize with the LineRecordWriter.
+ *
+ * @param lineRecordWriter
+ * Line record writer from SequenceFileOutputFormat
+ */
+ public BinaryVertexWriter(RecordWriter<KmerBytesWritable, ValueStateWritable> lineRecordWriter) {
+ this.lineRecordWriter = lineRecordWriter;
+ }
+
+ @Override
+ public void initialize(TaskAttemptContext context) throws IOException {
+ this.context = context;
+ }
+
+ @Override
+ public void close(TaskAttemptContext context) throws IOException, InterruptedException {
+ lineRecordWriter.close(context);
+ }
+
+ /**
+ * Get the line record writer.
+ *
+ * @return Record writer to be used for writing.
+ */
+ public RecordWriter<KmerBytesWritable, ValueStateWritable> getRecordWriter() {
+ return lineRecordWriter;
+ }
+
+ /**
+ * Get the context.
+ *
+ * @return Context passed to initialize.
+ */
+ public TaskAttemptContext getContext() {
+ return context;
+ }
+ }
+
+ @Override
+ public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
+ binaryOutputFormat.checkOutputSpecs(context);
+ }
+
+ @Override
+ public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
+ return binaryOutputFormat.getOutputCommitter(context);
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/client/Client.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/client/Client.java
new file mode 100644
index 0000000..60342a7
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/client/Client.java
@@ -0,0 +1,72 @@
+
+package edu.uci.ics.genomix.pregelix.client;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+
+import edu.uci.ics.genomix.pregelix.operator.LogAlgorithmForPathMergeVertex;
+import edu.uci.ics.genomix.pregelix.operator.NaiveAlgorithmForPathMergeVertex;
+import edu.uci.ics.pregelix.api.job.PregelixJob;
+import edu.uci.ics.pregelix.core.base.IDriver.Plan;
+import edu.uci.ics.pregelix.core.driver.Driver;
+
+public class Client {
+
+ private static class Options {
+ @Option(name = "-inputpaths", usage = "comma seprated input paths", required = true)
+ public String inputPaths;
+
+ @Option(name = "-outputpath", usage = "output path", required = true)
+ public String outputPath;
+
+ @Option(name = "-ip", usage = "ip address of cluster controller", required = true)
+ public String ipAddress;
+
+ @Option(name = "-port", usage = "port of cluster controller", required = false)
+ public int port;
+
+ @Option(name = "-plan", usage = "query plan choice", required = false)
+ public Plan planChoice = Plan.OUTER_JOIN;
+
+ @Option(name = "-kmer-size", usage = "the size of kmer", required = false)
+ public int sizeKmer;
+
+ @Option(name = "-num-iteration", usage = "max number of iterations, for pagerank job only", required = false)
+ public long numIteration = -1;
+
+ @Option(name = "-runtime-profiling", usage = "whether to do runtime profifling", required = false)
+ public String profiling = "false";
+ }
+
+ public static void run(String[] args, PregelixJob job) throws Exception {
+ Options options = prepareJob(args, job);
+ Driver driver = new Driver(Client.class);
+ driver.runJob(job, options.planChoice, options.ipAddress, options.port, Boolean.parseBoolean(options.profiling));
+ }
+
+ private static Options prepareJob(String[] args, PregelixJob job) throws CmdLineException, IOException {
+ Options options = new Options();
+ CmdLineParser parser = new CmdLineParser(options);
+ parser.parseArgument(args);
+
+ String[] inputs = options.inputPaths.split(";");
+ FileInputFormat.setInputPaths(job, inputs[0]);
+        for (int i = 1; i < inputs.length; i++)
+            FileInputFormat.addInputPaths(job, inputs[i]);
+ FileOutputFormat.setOutputPath(job, new Path(options.outputPath));
+ job.getConfiguration().setInt(NaiveAlgorithmForPathMergeVertex.KMER_SIZE, options.sizeKmer);
+ job.getConfiguration().setInt(LogAlgorithmForPathMergeVertex.KMER_SIZE, options.sizeKmer);
+ if (options.numIteration > 0){
+ job.getConfiguration().setLong(NaiveAlgorithmForPathMergeVertex.ITERATIONS, options.numIteration);
+ job.getConfiguration().setLong(LogAlgorithmForPathMergeVertex.ITERATIONS, options.numIteration);
+ }
+ return options;
+ }
+
+}
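For reference, a minimal sketch of how a job could be wired through this Client;
the job name, paths, and cluster address are illustrative, and the setters are
the standard PregelixJob configuration calls, not part of this patch:

    PregelixJob job = new PregelixJob("NaiveAlgorithmForPathMerge");
    job.setVertexClass(NaiveAlgorithmForPathMergeVertex.class);
    job.setVertexInputFormatClass(NaiveAlgorithmForPathMergeInputFormat.class);
    job.setVertexOutputFormatClass(NaiveAlgorithmForPathMergeOutputFormat.class);
    Client.run(new String[] { "-inputpaths", "/data/kmers", "-outputpath", "/result",
            "-ip", "127.0.0.1", "-port", "3099", "-kmer-size", "5" }, job);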
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/LogAlgorithmForPathMergeInputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/LogAlgorithmForPathMergeInputFormat.java
new file mode 100644
index 0000000..7a7e43d
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/LogAlgorithmForPathMergeInputFormat.java
@@ -0,0 +1,79 @@
+package edu.uci.ics.genomix.pregelix.format;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+import edu.uci.ics.genomix.pregelix.api.io.binary.BinaryVertexInputFormat;
+import edu.uci.ics.genomix.pregelix.io.LogAlgorithmMessageWritable;
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.pregelix.type.State;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+import edu.uci.ics.pregelix.api.graph.Vertex;
+import edu.uci.ics.pregelix.api.io.VertexReader;
+import edu.uci.ics.pregelix.api.util.BspUtils;
+
+public class LogAlgorithmForPathMergeInputFormat extends
+ BinaryVertexInputFormat<KmerBytesWritable, ValueStateWritable, NullWritable, LogAlgorithmMessageWritable>{
+    /**
+     * Creates a reader over the binary (kmer, adjacency/count) input.
+     */
+ @SuppressWarnings("unchecked")
+ @Override
+ public VertexReader<KmerBytesWritable, ValueStateWritable, NullWritable, LogAlgorithmMessageWritable> createVertexReader(
+ InputSplit split, TaskAttemptContext context) throws IOException {
+ return new BinaryLoadGraphReader(binaryInputFormat.createRecordReader(split, context));
+ }
+
+ @SuppressWarnings("rawtypes")
+ class BinaryLoadGraphReader extends
+ BinaryVertexReader<KmerBytesWritable, ValueStateWritable, NullWritable, LogAlgorithmMessageWritable> {
+ private Vertex vertex = null;
+ private KmerBytesWritable vertexId = null;
+ private ValueStateWritable vertexValue = new ValueStateWritable();
+
+ public BinaryLoadGraphReader(RecordReader<KmerBytesWritable,KmerCountValue> recordReader) {
+ super(recordReader);
+ }
+
+ @Override
+ public boolean nextVertex() throws IOException, InterruptedException {
+ return getRecordReader().nextKeyValue();
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public Vertex<KmerBytesWritable, ValueStateWritable, NullWritable, LogAlgorithmMessageWritable> getCurrentVertex() throws IOException,
+ InterruptedException {
+ if (vertex == null)
+ vertex = (Vertex) BspUtils.createVertex(getContext().getConfiguration());
+
+ vertex.getMsgList().clear();
+ vertex.getEdges().clear();
+
+ if(getRecordReader() != null){
+ /**
+ * set the src vertex id
+ */
+ if(vertexId == null)
+ vertexId = new KmerBytesWritable(getRecordReader().getCurrentKey().getKmerLength());
+ vertexId.set(getRecordReader().getCurrentKey());
+ vertex.setVertexId(vertexId);
+ /**
+ * set the vertex value
+ */
+ KmerCountValue kmerCountValue = getRecordReader().getCurrentValue();
+ vertexValue.setAdjMap(kmerCountValue.getAdjBitMap());
+ vertexValue.setState(State.NON_VERTEX);
+ vertex.setVertexValue(vertexValue);
+ }
+
+ return vertex;
+ }
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/LogAlgorithmForPathMergeOutputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/LogAlgorithmForPathMergeOutputFormat.java
new file mode 100644
index 0000000..8075238
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/LogAlgorithmForPathMergeOutputFormat.java
@@ -0,0 +1,46 @@
+package edu.uci.ics.genomix.pregelix.format;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+import edu.uci.ics.genomix.pregelix.api.io.binary.BinaryVertexOutputFormat;
+import edu.uci.ics.pregelix.api.graph.Vertex;
+import edu.uci.ics.pregelix.api.io.VertexWriter;
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.pregelix.type.State;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+
+public class LogAlgorithmForPathMergeOutputFormat extends
+ BinaryVertexOutputFormat<KmerBytesWritable, ValueStateWritable, NullWritable> {
+
+ @Override
+ public VertexWriter<KmerBytesWritable, ValueStateWritable, NullWritable> createVertexWriter(TaskAttemptContext context)
+ throws IOException, InterruptedException {
+ @SuppressWarnings("unchecked")
+ RecordWriter<KmerBytesWritable, ValueStateWritable> recordWriter = binaryOutputFormat.getRecordWriter(context);
+ return new BinaryLoadGraphVertexWriter(recordWriter);
+ }
+
+ /**
+     * Simple VertexWriter used by this binary output format.
+ */
+ public static class BinaryLoadGraphVertexWriter extends
+ BinaryVertexWriter<KmerBytesWritable, ValueStateWritable, NullWritable> {
+
+ public BinaryLoadGraphVertexWriter(RecordWriter<KmerBytesWritable, ValueStateWritable> lineRecordWriter) {
+ super(lineRecordWriter);
+ }
+
+ @Override
+ public void writeVertex(Vertex<KmerBytesWritable, ValueStateWritable, NullWritable, ?> vertex) throws IOException,
+ InterruptedException {
+ if(vertex.getVertexValue().getState() != State.FINAL_DELETE
+ && vertex.getVertexValue().getState() != State.END_VERTEX)
+ getRecordWriter().write(vertex.getVertexId(),vertex.getVertexValue());
+ }
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NaiveAlgorithmForPathMergeInputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NaiveAlgorithmForPathMergeInputFormat.java
new file mode 100644
index 0000000..ca134c0
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NaiveAlgorithmForPathMergeInputFormat.java
@@ -0,0 +1,78 @@
+package edu.uci.ics.genomix.pregelix.format;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+import edu.uci.ics.pregelix.api.graph.Vertex;
+import edu.uci.ics.pregelix.api.io.VertexReader;
+import edu.uci.ics.pregelix.api.util.BspUtils;
+import edu.uci.ics.genomix.pregelix.io.NaiveAlgorithmMessageWritable;
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.pregelix.api.io.binary.BinaryVertexInputFormat;
+import edu.uci.ics.genomix.pregelix.api.io.binary.BinaryVertexInputFormat.BinaryVertexReader;
+
+public class NaiveAlgorithmForPathMergeInputFormat extends
+ BinaryVertexInputFormat<KmerBytesWritable, ValueStateWritable, NullWritable, NaiveAlgorithmMessageWritable>{
+    /**
+     * Creates a reader over the binary (kmer, adjacency/count) input.
+     */
+ @SuppressWarnings("unchecked")
+ @Override
+ public VertexReader<KmerBytesWritable, ValueStateWritable, NullWritable, NaiveAlgorithmMessageWritable> createVertexReader(
+ InputSplit split, TaskAttemptContext context) throws IOException {
+ return new BinaryLoadGraphReader(binaryInputFormat.createRecordReader(split, context));
+ }
+}
+
+@SuppressWarnings("rawtypes")
+class BinaryLoadGraphReader extends
+ BinaryVertexReader<KmerBytesWritable, ValueStateWritable, NullWritable, NaiveAlgorithmMessageWritable> {
+ private Vertex vertex;
+ private KmerBytesWritable vertexId = null;
+ private ValueStateWritable vertexValue = new ValueStateWritable();
+
+ public BinaryLoadGraphReader(RecordReader<KmerBytesWritable, KmerCountValue> recordReader) {
+ super(recordReader);
+ }
+
+ @Override
+ public boolean nextVertex() throws IOException, InterruptedException {
+ return getRecordReader().nextKeyValue();
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public Vertex<KmerBytesWritable, ValueStateWritable, NullWritable, NaiveAlgorithmMessageWritable> getCurrentVertex() throws IOException,
+ InterruptedException {
+ if (vertex == null)
+ vertex = (Vertex) BspUtils.createVertex(getContext().getConfiguration());
+
+ vertex.getMsgList().clear();
+ vertex.getEdges().clear();
+
+ vertex.reset();
+ if(getRecordReader() != null){
+ /**
+ * set the src vertex id
+ */
+ if(vertexId == null)
+ vertexId = new KmerBytesWritable(getRecordReader().getCurrentKey().getKmerLength());
+ vertexId.set(getRecordReader().getCurrentKey());
+ vertex.setVertexId(vertexId);
+ /**
+ * set the vertex value
+ */
+ KmerCountValue kmerCountValue = getRecordReader().getCurrentValue();
+ vertexValue.setAdjMap(kmerCountValue.getAdjBitMap());
+ vertex.setVertexValue(vertexValue);
+ }
+
+ return vertex;
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NaiveAlgorithmForPathMergeOutputFormat.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NaiveAlgorithmForPathMergeOutputFormat.java
new file mode 100644
index 0000000..983112b
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/format/NaiveAlgorithmForPathMergeOutputFormat.java
@@ -0,0 +1,41 @@
+package edu.uci.ics.genomix.pregelix.format;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+import edu.uci.ics.genomix.pregelix.api.io.binary.BinaryVertexOutputFormat;
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.pregelix.api.graph.Vertex;
+import edu.uci.ics.pregelix.api.io.VertexWriter;
+
+public class NaiveAlgorithmForPathMergeOutputFormat extends
+ BinaryVertexOutputFormat<KmerBytesWritable, ValueStateWritable, NullWritable> {
+
+ @Override
+ public VertexWriter<KmerBytesWritable, ValueStateWritable, NullWritable> createVertexWriter(TaskAttemptContext context)
+ throws IOException, InterruptedException {
+ @SuppressWarnings("unchecked")
+ RecordWriter<KmerBytesWritable, ValueStateWritable> recordWriter = binaryOutputFormat.getRecordWriter(context);
+ return new BinaryLoadGraphVertexWriter(recordWriter);
+ }
+
+ /**
+     * Simple VertexWriter used by this binary output format.
+ */
+ public static class BinaryLoadGraphVertexWriter extends
+ BinaryVertexWriter<KmerBytesWritable, ValueStateWritable, NullWritable> {
+ public BinaryLoadGraphVertexWriter(RecordWriter<KmerBytesWritable, ValueStateWritable> lineRecordWriter) {
+ super(lineRecordWriter);
+ }
+
+ @Override
+ public void writeVertex(Vertex<KmerBytesWritable, ValueStateWritable, NullWritable, ?> vertex) throws IOException,
+ InterruptedException {
+ getRecordWriter().write(vertex.getVertexId(),vertex.getVertexValue());
+ }
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/graph/Graph.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/graph/Graph.java
new file mode 100644
index 0000000..df09a64
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/graph/Graph.java
@@ -0,0 +1,55 @@
+package edu.uci.ics.genomix.pregelix.graph;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+
+public class Graph {
+
+ /**
+ * Construct a DOT graph in memory, convert it
+ * to image and store the image in the file system.
+ * @throws Exception
+ */
+ private void start(String fileName) throws Exception
+ {
+ File filePathTo = new File("graph/" + fileName);
+ BufferedReader br = new BufferedReader(new FileReader(filePathTo));
+ String line = "";
+ String[] split;
+
+ String precursor = "";
+ String[] adjMap;
+ char[] succeeds;
+ String succeed = "";
+ String output;
+
+ GraphViz gv = new GraphViz();
+ gv.addln(gv.start_graph());
+ while((line = br.readLine()) != null){
+ split = line.split("\t");
+ precursor = split[0];
+ adjMap = split[1].split("\\|");
+ if(adjMap.length > 1){
+ succeeds = adjMap[1].toCharArray();
+ for(int i = 0; i < succeeds.length; i++){
+ succeed = precursor.substring(1) + succeeds[i];
+ output = precursor + " -> " + succeed;
+ gv.addln(output);
+ }
+ }
+ }
+ gv.addln(gv.end_graph());
+ System.out.println(gv.getDotSource());
+
+ String type = "ps";
+ File out = new File("graph/" + fileName + "_out." + type); // Linux
+ gv.writeGraphToFile(gv.getGraph(gv.getDotSource(), type), out);
+ }
+
+ public static void main(String[] args) throws Exception
+ {
+ Graph g = new Graph();
+ g.start("Path");
+ }
+}
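Worked example: for a fixture line "ACACT<TAB>|G" (the TwoKmer layout assumed
earlier), start() emits the edge "ACACT -> CACTG", so the printed DOT source is:

    digraph G {
    ACACT -> CACTG
    }

start() then hands that source to GraphViz (next file) to render it as
graph/<fileName>_out.ps.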
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/graph/GraphViz.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/graph/GraphViz.java
new file mode 100644
index 0000000..c2178bc
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/graph/GraphViz.java
@@ -0,0 +1,288 @@
+package edu.uci.ics.genomix.pregelix.graph;
+
+// GraphViz.java - a simple API to call dot from Java programs
+
+/*$Id$*/
+/*
+ ******************************************************************************
+ * *
+ * (c) Copyright 2003 Laszlo Szathmary *
+ * *
+ * This program is free software; you can redistribute it and/or modify it *
+ * under the terms of the GNU Lesser General Public License as published by *
+ * the Free Software Foundation; either version 2.1 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, but *
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public *
+ * License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the Free Software Foundation, *
+ * Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *
+ * *
+ ******************************************************************************
+ */
+
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.InputStreamReader;
+
+/**
+ * <dl>
+ * <dt>Purpose: GraphViz Java API
+ * <dd>
+ *
+ * <dt>Description:
+ * <dd> With this Java class you can simply call dot
+ * from your Java programs
+ * <dt>Example usage:
+ * <dd>
+ * <pre>
+ * GraphViz gv = new GraphViz();
+ * gv.addln(gv.start_graph());
+ * gv.addln("A -> B;");
+ * gv.addln("A -> C;");
+ * gv.addln(gv.end_graph());
+ * System.out.println(gv.getDotSource());
+ *
+ * String type = "gif";
+ * File out = new File("out." + type); // out.gif in this example
+ * gv.writeGraphToFile( gv.getGraph( gv.getDotSource(), type ), out );
+ * </pre>
+ * </dd>
+ *
+ * </dl>
+ *
+ * @version v0.4, 2011/02/05 (February) -- Patch of Keheliya Gallaba is added. Now you
+ * can specify the type of the output file: gif, dot, fig, pdf, ps, svg, png, etc.
+ * @version v0.3, 2010/11/29 (November) -- Windows support + ability
+ * to read the graph from a text file
+ * @version v0.2, 2010/07/22 (July) -- bug fix
+ * @version v0.1, 2003/12/04 (December) -- first release
+ * @author Laszlo Szathmary (<a href="jabba.laci@gmail.com">jabba.laci@gmail.com</a>)
+ */
+public class GraphViz
+{
+ /**
+ * The dir. where temporary files will be created.
+ */
+ private static String TEMP_DIR = "/tmp"; // Linux
+ // private static String TEMP_DIR = "c:/temp"; // Windows
+
+ /**
+ * Where is your dot program located? It will be called externally.
+ */
+ private static String DOT = "/usr/bin/dot"; // Linux
+// private static String DOT = "c:/Program Files/Graphviz2.26.3/bin/dot.exe"; // Windows
+
+ /**
+ * The source of the graph written in dot language.
+ */
+ private StringBuilder graph = new StringBuilder();
+
+ /**
+ * Constructor: creates a new GraphViz object that will contain
+ * a graph.
+ */
+ public GraphViz() {
+ }
+
+ /**
+ * Returns the graph's source description in dot language.
+ * @return Source of the graph in dot language.
+ */
+ public String getDotSource() {
+ return graph.toString();
+ }
+
+ /**
+ * Adds a string to the graph's source (without newline).
+ */
+ public void add(String line) {
+ graph.append(line);
+ }
+
+ /**
+ * Adds a string to the graph's source (with newline).
+ */
+ public void addln(String line) {
+ graph.append(line + "\n");
+ }
+
+ /**
+ * Adds a newline to the graph's source.
+ */
+ public void addln() {
+ graph.append('\n');
+ }
+
+ /**
+ * Returns the graph as an image in binary format.
+ * @param dot_source Source of the graph to be drawn.
+ * @param type Type of the output image to be produced, e.g.: gif, dot, fig, pdf, ps, svg, png.
+ * @return A byte array containing the image of the graph.
+ */
+ public byte[] getGraph(String dot_source, String type)
+ {
+ File dot;
+ byte[] img_stream = null;
+
+ try {
+ dot = writeDotSourceToFile(dot_source);
+ if (dot != null)
+ {
+ img_stream = get_img_stream(dot, type);
+ if (dot.delete() == false)
+ System.err.println("Warning: " + dot.getAbsolutePath() + " could not be deleted!");
+ return img_stream;
+ }
+ return null;
+ } catch (java.io.IOException ioe) { return null; }
+ }
+
+ /**
+ * Writes the graph's image in a file.
+ * @param img A byte array containing the image of the graph.
+ * @param file Name of the file to where we want to write.
+ * @return Success: 1, Failure: -1
+ */
+ public int writeGraphToFile(byte[] img, String file)
+ {
+ File to = new File(file);
+ return writeGraphToFile(img, to);
+ }
+
+ /**
+ * Writes the graph's image in a file.
+ * @param img A byte array containing the image of the graph.
+ * @param to A File object to where we want to write.
+ * @return Success: 1, Failure: -1
+ */
+ public int writeGraphToFile(byte[] img, File to)
+ {
+ try {
+ FileOutputStream fos = new FileOutputStream(to);
+ fos.write(img);
+ fos.close();
+ } catch (java.io.IOException ioe) { return -1; }
+ return 1;
+ }
+
+ /**
+ * It will call the external dot program, and return the image in
+ * binary format.
+ * @param dot Source of the graph (in dot language).
+ * @param type Type of the output image to be produced, e.g.: gif, dot, fig, pdf, ps, svg, png.
+ * @return The image of the graph in .gif format.
+ */
+ private byte[] get_img_stream(File dot, String type)
+ {
+ File img;
+ byte[] img_stream = null;
+
+ try {
+ img = File.createTempFile("graph_", "."+type, new File(GraphViz.TEMP_DIR));
+ Runtime rt = Runtime.getRuntime();
+
+ // patch by Mike Chenault
+ String[] args = {DOT, "-T"+type, dot.getAbsolutePath(), "-o", img.getAbsolutePath()};
+ Process p = rt.exec(args);
+
+ p.waitFor();
+
+         FileInputStream in = new FileInputStream(img.getAbsolutePath());
+         img_stream = new byte[(int) img.length()];
+         // read() may return fewer bytes than requested, so read fully
+         new DataInputStream(in).readFully(img_stream);
+         in.close();
+
+ if (img.delete() == false)
+ System.err.println("Warning: " + img.getAbsolutePath() + " could not be deleted!");
+ }
+ catch (java.io.IOException ioe) {
+ System.err.println("Error: in I/O processing of tempfile in dir " + GraphViz.TEMP_DIR+"\n");
+ System.err.println(" or in calling external command");
+ ioe.printStackTrace();
+ }
+ catch (java.lang.InterruptedException ie) {
+ System.err.println("Error: the execution of the external program was interrupted");
+ ie.printStackTrace();
+ }
+
+ return img_stream;
+ }
+
+ /**
+ * Writes the source of the graph in a file, and returns the written file
+ * as a File object.
+ * @param str Source of the graph (in dot language).
+ * @return The file (as a File object) that contains the source of the graph.
+ */
+ private File writeDotSourceToFile(String str) throws java.io.IOException
+ {
+ File temp;
+ try {
+ temp = File.createTempFile("graph_", ".dot.tmp", new File(GraphViz.TEMP_DIR));
+ FileWriter fout = new FileWriter(temp);
+ fout.write(str);
+ fout.close();
+ }
+ catch (Exception e) {
+ System.err.println("Error: I/O error while writing the dot source to temp file!");
+ return null;
+ }
+ return temp;
+ }
+
+ /**
+ * Returns a string that is used to start a graph.
+ * @return A string to open a graph.
+ */
+ public String start_graph() {
+ return "digraph G {";
+ }
+
+ /**
+ * Returns a string that is used to end a graph.
+ * @return A string to close a graph.
+ */
+ public String end_graph() {
+ return "}";
+ }
+
+ /**
+ * Read a DOT graph from a text file.
+ *
+ * @param input Input text file containing the DOT graph
+ * source.
+ */
+ public void readSource(String input)
+ {
+ StringBuilder sb = new StringBuilder();
+
+ try
+ {
+ FileInputStream fis = new FileInputStream(input);
+ DataInputStream dis = new DataInputStream(fis);
+ BufferedReader br = new BufferedReader(new InputStreamReader(dis));
+ String line;
+ while ((line = br.readLine()) != null) {
+ sb.append(line);
+ }
+ dis.close();
+ }
+ catch (Exception e) {
+ System.err.println("Error: " + e.getMessage());
+ }
+
+ this.graph = sb;
+ }
+
+} // end of class GraphViz
+
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/LogAlgorithmMessageWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/LogAlgorithmMessageWritable.java
new file mode 100644
index 0000000..739bacb
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/LogAlgorithmMessageWritable.java
@@ -0,0 +1,132 @@
+package edu.uci.ics.genomix.pregelix.io;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparable;
+
+import edu.uci.ics.genomix.pregelix.operator.LogAlgorithmForPathMergeVertex;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class LogAlgorithmMessageWritable implements WritableComparable<LogAlgorithmMessageWritable>{
+ /**
+     * sourceVertexId stores the source vertexId when headVertex sends the message,
+     * and the neighbor's vertexValue when pathVertex sends the message;
+     * chainVertexId stores the chain of connected DNA
+ */
+ private KmerBytesWritable sourceVertexId;
+ private VKmerBytesWritable chainVertexId;
+ private byte adjMap;
+ private int message;
+ private int sourceVertexState;
+
+ public LogAlgorithmMessageWritable(){
+ sourceVertexId = new VKmerBytesWritable(LogAlgorithmForPathMergeVertex.kmerSize);
+ chainVertexId = new VKmerBytesWritable(LogAlgorithmForPathMergeVertex.kmerSize);
+ }
+
+ public void set(KmerBytesWritable sourceVertexId, VKmerBytesWritable chainVertexId, byte adjMap, int message, int sourceVertexState){
+ this.sourceVertexId.set(sourceVertexId);
+ this.chainVertexId.set(chainVertexId);
+ this.adjMap = adjMap;
+ this.message = message;
+ this.sourceVertexState = sourceVertexState;
+ }
+
+ public void reset(){
+ //sourceVertexId.reset(LogAlgorithmForPathMergeVertex.kmerSize);
+ chainVertexId.reset(LogAlgorithmForPathMergeVertex.kmerSize);
+ adjMap = (byte)0;
+ message = 0;
+ sourceVertexState = 0;
+ }
+
+ public KmerBytesWritable getSourceVertexId() {
+ return sourceVertexId;
+ }
+
+ public void setSourceVertexId(KmerBytesWritable sourceVertexId) {
+ this.sourceVertexId.set(sourceVertexId);
+ }
+
+ public byte getAdjMap() {
+ return adjMap;
+ }
+
+ public void setAdjMap(byte adjMap) {
+ this.adjMap = adjMap;
+ }
+
+ public VKmerBytesWritable getChainVertexId() {
+ return chainVertexId;
+ }
+
+ public void setChainVertexId(VKmerBytesWritable chainVertexId) {
+ this.chainVertexId.set(chainVertexId);
+ }
+
+ public int getMessage() {
+ return message;
+ }
+
+ public void setMessage(int message) {
+ this.message = message;
+ }
+
+ public int getSourceVertexState() {
+ return sourceVertexState;
+ }
+
+ public void setSourceVertexState(int sourceVertexState) {
+ this.sourceVertexState = sourceVertexState;
+ }
+
+ public int getLengthOfChain() {
+ return chainVertexId.getKmerLength();
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ sourceVertexId.write(out);
+ chainVertexId.write(out);
+ out.write(adjMap);
+ out.writeInt(message);
+ out.writeInt(sourceVertexState);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ sourceVertexId.readFields(in);
+ chainVertexId.readFields(in);
+ adjMap = in.readByte();
+ message = in.readInt();
+ sourceVertexState = in.readInt();
+ }
+
+ @Override
+ public int hashCode() {
+ return chainVertexId.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+        if (o instanceof LogAlgorithmMessageWritable) {
+ LogAlgorithmMessageWritable tp = (LogAlgorithmMessageWritable) o;
+ return chainVertexId.equals(tp.chainVertexId);
+ }
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ return chainVertexId.toString();
+ }
+
+ @Override
+ public int compareTo(LogAlgorithmMessageWritable tp) {
+ return chainVertexId.compareTo(tp.chainVertexId);
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/NaiveAlgorithmMessageWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/NaiveAlgorithmMessageWritable.java
new file mode 100644
index 0000000..55a626d
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/NaiveAlgorithmMessageWritable.java
@@ -0,0 +1,125 @@
+package edu.uci.ics.genomix.pregelix.io;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparable;
+
+import edu.uci.ics.genomix.pregelix.operator.NaiveAlgorithmForPathMergeVertex;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class NaiveAlgorithmMessageWritable implements WritableComparable<NaiveAlgorithmMessageWritable>{
+ /**
+     * sourceVertexId stores the source vertexId when headVertex sends the message,
+     * and the neighbor's vertexValue when pathVertex sends the message;
+     * chainVertexId stores the chain of connected DNA
+ */
+ private KmerBytesWritable sourceVertexId;
+ private VKmerBytesWritable chainVertexId;
+ private KmerBytesWritable headVertexId;
+ private byte adjMap;
+ private boolean isRear;
+
+ public NaiveAlgorithmMessageWritable(){
+ sourceVertexId = new VKmerBytesWritable(NaiveAlgorithmForPathMergeVertex.kmerSize);
+ chainVertexId = new VKmerBytesWritable(NaiveAlgorithmForPathMergeVertex.kmerSize);
+ headVertexId = new VKmerBytesWritable(NaiveAlgorithmForPathMergeVertex.kmerSize);
+ }
+
+ public void set(KmerBytesWritable sourceVertex, VKmerBytesWritable chainVertex, KmerBytesWritable headVertex , byte adjMap, boolean isRear){
+ this.sourceVertexId.set(sourceVertex);
+ this.chainVertexId.set(chainVertex);
+ this.headVertexId.set(headVertex);
+ this.adjMap = adjMap;
+ this.isRear = isRear;
+ }
+
+ public KmerBytesWritable getSourceVertexId() {
+ return sourceVertexId;
+ }
+
+ public void setSourceVertexId(KmerBytesWritable source) {
+ this.sourceVertexId.set(source);
+ }
+
+ public byte getAdjMap() {
+ return adjMap;
+ }
+
+ public void setAdjMap(byte adjMap) {
+ this.adjMap = adjMap;
+ }
+
+ public void setChainVertexId(VKmerBytesWritable chainVertex) {
+ this.chainVertexId.set(chainVertex);
+ }
+
+ public VKmerBytesWritable getChainVertexId() {
+ return chainVertexId;
+ }
+
+ public boolean isRear() {
+ return isRear;
+ }
+
+ public void setRear(boolean isRear) {
+ this.isRear = isRear;
+ }
+
+ public int getLengthOfChain() {
+ return this.chainVertexId.getKmerLength();
+ }
+
+
+ public KmerBytesWritable getHeadVertexId() {
+ return headVertexId;
+ }
+
+ public void setHeadVertexId(KmerBytesWritable headVertexId) {
+ this.headVertexId.set(headVertexId);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ sourceVertexId.write(out);
+ headVertexId.write(out);
+ chainVertexId.write(out);
+ out.write(adjMap);
+ out.writeBoolean(isRear);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ sourceVertexId.readFields(in);
+ headVertexId.readFields(in);
+ chainVertexId.readFields(in);
+ adjMap = in.readByte();
+ isRear = in.readBoolean();
+ }
+
+ @Override
+ public int hashCode() {
+ return chainVertexId.hashCode();
+ }
+ @Override
+ public boolean equals(Object o) {
+ if (o instanceof NaiveAlgorithmMessageWritable) {
+ NaiveAlgorithmMessageWritable tp = (NaiveAlgorithmMessageWritable) o;
+ return chainVertexId.equals( tp.chainVertexId);
+ }
+ return false;
+ }
+ @Override
+ public String toString() {
+ return chainVertexId.toString();
+ }
+
+ @Override
+ public int compareTo(NaiveAlgorithmMessageWritable tp) {
+ return chainVertexId.compareTo(tp.chainVertexId);
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/ValueStateWritable.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/ValueStateWritable.java
new file mode 100644
index 0000000..769277c
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/io/ValueStateWritable.java
@@ -0,0 +1,99 @@
+package edu.uci.ics.genomix.pregelix.io;
+
+import java.io.*;
+
+import org.apache.hadoop.io.WritableComparable;
+
+import edu.uci.ics.genomix.pregelix.operator.NaiveAlgorithmForPathMergeVertex;
+import edu.uci.ics.genomix.pregelix.type.State;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+
+public class ValueStateWritable implements WritableComparable<ValueStateWritable> {
+
+ private byte adjMap;
+ private int state;
+ private VKmerBytesWritable mergeChain;
+
+ public ValueStateWritable() {
+ state = State.NON_VERTEX;
+ mergeChain = new VKmerBytesWritable(NaiveAlgorithmForPathMergeVertex.kmerSize);
+ }
+
+ public ValueStateWritable(byte adjMap, int state, VKmerBytesWritable mergeChain) {
+ this.adjMap = adjMap;
+ this.state = state;
+ // allocate before copying; the field is otherwise null in this constructor
+ this.mergeChain = new VKmerBytesWritable(mergeChain.getKmerLength());
+ this.mergeChain.set(mergeChain);
+ }
+
+ public void set(byte adjMap, int state, VKmerBytesWritable mergeChain){
+ this.adjMap = adjMap;
+ this.state = state;
+ this.mergeChain.set(mergeChain);
+ }
+
+ public byte getAdjMap() {
+ return adjMap;
+ }
+
+ public void setAdjMap(byte adjMap) {
+ this.adjMap = adjMap;
+ }
+
+ public int getState() {
+ return state;
+ }
+
+ public void setState(int state) {
+ this.state = state;
+ }
+
+ public int getLengthOfMergeChain() {
+ return mergeChain.getKmerLength();
+ }
+
+ public VKmerBytesWritable getMergeChain() {
+ return mergeChain;
+ }
+
+ public void setMergeChain(KmerBytesWritable mergeChain) {
+ this.mergeChain.set(mergeChain);
+ }
+
+ public void setMergeChain(VKmerBytesWritable mergeChain) {
+ this.mergeChain.set(mergeChain);
+ }
+
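+ /**
+ * On-disk layout (matching write/readFields below): one byte for adjMap,
+ * one int for state, then the variable-length mergeChain serialized by
+ * VKmerBytesWritable itself.
+ */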
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ adjMap = in.readByte();
+ state = in.readInt();
+ mergeChain.readFields(in);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(adjMap);
+ out.writeInt(state);
+ mergeChain.write(out);
+ }
+
+ @Override
+ public int compareTo(ValueStateWritable o) {
+ return mergeChain.compareTo(o.mergeChain);
+ }
+
+ @Override
+ public String toString() {
+ if(mergeChain.getKmerLength() == 0)
+ return GeneCode.getSymbolFromBitMap(adjMap);
+ return GeneCode.getSymbolFromBitMap(adjMap) + "\t" +
+ getLengthOfMergeChain() + "\t" +
+ mergeChain.toString() + "\t" +
+ state;
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/DataLoadLogFormatter.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/DataLoadLogFormatter.java
new file mode 100644
index 0000000..6105f18
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/DataLoadLogFormatter.java
@@ -0,0 +1,37 @@
+package edu.uci.ics.genomix.pregelix.log;
+
+import java.util.logging.Formatter;
+import java.util.logging.Handler;
+import java.util.logging.LogRecord;
+
+import edu.uci.ics.genomix.type.KmerCountValue;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class DataLoadLogFormatter extends Formatter{
+ private VKmerBytesWritable key = new VKmerBytesWritable(1); // initialized so set() can copy into it
+ private KmerCountValue value;
+
+ public void set(VKmerBytesWritable key,
+ KmerCountValue value){
+ this.key.set(key);
+ this.value = value;
+ }
+ public String format(LogRecord record) {
+ StringBuilder builder = new StringBuilder(1000);
+
+ builder.append(key.toString()
+ + "\t" + value.toString() + "\r\n");
+
+ if(!formatMessage(record).equals(""))
+ builder.append(formatMessage(record) + "\r\n");
+ return builder.toString();
+ }
+
+ public String getHead(Handler h) {
+ return super.getHead(h);
+ }
+
+ public String getTail(Handler h) {
+ return super.getTail(h);
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/LogAlgorithmLogFormatter.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/LogAlgorithmLogFormatter.java
new file mode 100644
index 0000000..d4f03ee
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/LogAlgorithmLogFormatter.java
@@ -0,0 +1,111 @@
+package edu.uci.ics.genomix.pregelix.log;
+
+import java.util.logging.*;
+
+import edu.uci.ics.genomix.pregelix.io.LogAlgorithmMessageWritable;
+import edu.uci.ics.genomix.pregelix.type.Message;
+import edu.uci.ics.genomix.pregelix.type.State;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class LogAlgorithmLogFormatter extends Formatter {
+ //
+ // Create a DateFormat to format the logger timestamp.
+ //
+ //private static final DateFormat df = new SimpleDateFormat("dd/MM/yyyy hh:mm:ss.SSS");
+ private long step;
+ private VKmerBytesWritable sourceVertexId = new VKmerBytesWritable(1);
+ private VKmerBytesWritable destVertexId = new VKmerBytesWritable(1);
+ private LogAlgorithmMessageWritable msg = new LogAlgorithmMessageWritable();
+ private int state;
+ private VKmerBytesWritable mergeChain = new VKmerBytesWritable(1);
+ //private boolean testDelete = false;
+ /** 0: general operation
+ * 1: testDelete
+ * 2: testMergeChain
+ * 3: testVoteToHalt
+ */
+ private int operation;
+
+ public LogAlgorithmLogFormatter(){
+ }
+
+ public void set(long step, VKmerBytesWritable sourceVertexId,
+ VKmerBytesWritable destVertexId, LogAlgorithmMessageWritable msg, int state){
+ this.step = step;
+ this.sourceVertexId.set(sourceVertexId);
+ this.destVertexId.set(destVertexId);
+ this.msg = msg;
+ this.state = state;
+ this.operation = 0;
+ }
+ public void setMergeChain(long step, VKmerBytesWritable sourceVertexId,
+ VKmerBytesWritable mergeChain){
+ this.reset();
+ this.step = step;
+ this.sourceVertexId.set(sourceVertexId);
+ this.mergeChain.set(mergeChain);
+ this.operation = 2;
+ }
+ public void setVotoToHalt(long step, VKmerBytesWritable sourceVertexId){
+ this.reset();
+ this.step = step;
+ this.sourceVertexId.set(sourceVertexId);
+ this.operation = 3;
+ }
+ public void reset(){
+ this.sourceVertexId = new VKmerBytesWritable(1);
+ this.destVertexId = new VKmerBytesWritable(1);
+ this.msg = new LogAlgorithmMessageWritable();
+ this.state = 0;
+ this.mergeChain = new VKmerBytesWritable(1);
+ }
+ public String format(LogRecord record) {
+ StringBuilder builder = new StringBuilder(1000);
+ String source = sourceVertexId.toString();
+ String chain = "";
+
+ builder.append("Step: " + step + "\r\n");
+ builder.append("Source Code: " + source + "\r\n");
+ if(operation == 0){
+ if(destVertexId.getKmerLength() != -1){
+ String dest = destVertexId.toString();
+ builder.append("Send message to " + "\r\n");
+ builder.append("Destination Code: " + dest + "\r\n");
+ }
+ builder.append("Message is: " + Message.MESSAGE_CONTENT.getContentFromCode(msg.getMessage()) + "\r\n");
+
+ if(msg.getLengthOfChain() != -1){
+ chain = msg.getChainVertexId().toString();
+ builder.append("Chain Message: " + chain + "\r\n");
+ builder.append("Chain Length: " + msg.getLengthOfChain() + "\r\n");
+ }
+
+ builder.append("State is: " + State.STATE_CONTENT.getContentFromCode(state) + "\r\n");
+ }
+ if(operation == 2){
+ chain = mergeChain.toString();
+ builder.append("Merge Chain: " + chain + "\r\n");
+ builder.append("Merge Chain Length: " + mergeChain.getKmerLength() + "\r\n");
+ }
+ if(operation == 3)
+ builder.append("Vote to halt!");
+ if(!formatMessage(record).equals(""))
+ builder.append(formatMessage(record) + "\r\n");
+ builder.append("\n");
+ return builder.toString();
+ }
+
+ public String getHead(Handler h) {
+ return super.getHead(h);
+ }
+
+ public String getTail(Handler h) {
+ return super.getTail(h);
+ }
+ public int getOperation() {
+ return operation;
+ }
+ public void setOperation(int operation) {
+ this.operation = operation;
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/NaiveAlgorithmLogFormatter.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/NaiveAlgorithmLogFormatter.java
new file mode 100644
index 0000000..332d6d0
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/log/NaiveAlgorithmLogFormatter.java
@@ -0,0 +1,57 @@
+package edu.uci.ics.genomix.pregelix.log;
+
+import java.util.logging.*;
+
+import edu.uci.ics.genomix.pregelix.io.NaiveAlgorithmMessageWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class NaiveAlgorithmLogFormatter extends Formatter {
+ //
+ // Create a DateFormat to format the logger timestamp.
+ //
+ //private static final DateFormat df = new SimpleDateFormat("dd/MM/yyyy hh:mm:ss.SSS");
+ private long step;
+ private VKmerBytesWritable sourceVertexId = new VKmerBytesWritable(1); // initialized so set() can copy into them
+ private VKmerBytesWritable destVertexId = new VKmerBytesWritable(1);
+ private NaiveAlgorithmMessageWritable msg;
+
+ public void set(long step, VKmerBytesWritable sourceVertexId,
+ VKmerBytesWritable destVertexId, NaiveAlgorithmMessageWritable msg){
+ this.step = step;
+ this.sourceVertexId.set(sourceVertexId);
+ this.destVertexId.set(destVertexId);
+ this.msg = msg;
+ }
+ public String format(LogRecord record) {
+ StringBuilder builder = new StringBuilder(1000);
+ String source = sourceVertexId.toString();
+
+ String chain = "";
+
+ builder.append("Step: " + step + "\r\n");
+ builder.append("Source Code: " + source + "\r\n");
+
+ if(destVertexId != null){
+ builder.append("Send message to " + "\r\n");
+ String dest = destVertexId.toString();
+ builder.append("Destination Code: " + dest + "\r\n");
+ }
+ if(msg.getLengthOfChain() != 0){
+ chain = msg.getChainVertexId().toString();
+ builder.append("Chain Message: " + chain + "\r\n");
+ builder.append("Chain Length: " + msg.getLengthOfChain() + "\r\n");
+ }
+ if(!formatMessage(record).equals(""))
+ builder.append(formatMessage(record) + "\r\n");
+ builder.append("\n");
+ return builder.toString();
+ }
+
+ public String getHead(Handler h) {
+ return super.getHead(h);
+ }
+
+ public String getTail(Handler h) {
+ return super.getTail(h);
+ }
+}
\ No newline at end of file
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/LoadGraphVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/LoadGraphVertex.java
new file mode 100644
index 0000000..6fef3a6
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/LoadGraphVertex.java
@@ -0,0 +1,70 @@
+package edu.uci.ics.genomix.pregelix.operator;
+
+import java.util.Iterator;
+
+import org.apache.hadoop.io.NullWritable;
+
+import edu.uci.ics.genomix.pregelix.client.Client;
+import edu.uci.ics.genomix.pregelix.format.NaiveAlgorithmForPathMergeInputFormat;
+import edu.uci.ics.genomix.pregelix.format.NaiveAlgorithmForPathMergeOutputFormat;
+import edu.uci.ics.genomix.pregelix.io.NaiveAlgorithmMessageWritable;
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.pregelix.api.graph.Vertex;
+import edu.uci.ics.pregelix.api.job.PregelixJob;
+
+/*
+ * vertexId: KmerBytesWritable
+ * vertexValue: ValueStateWritable
+ * edgeValue: NullWritable
+ * message: NaiveAlgorithmMessageWritable
+ *
+ * DNA:
+ * A: 00
+ * C: 01
+ * G: 10
+ * T: 11
+ *
+ * successor node
+ * A 00000001 1
+ * G 00000010 2
+ * C 00000100 4
+ * T 00001000 8
+ * predecessor node
+ * A 00010000 16
+ * G 00100000 32
+ * C 01000000 64
+ * T 10000000 128
+ *
+ * For example, ONE LINE in the input file: 00,01,10 0001,0010,
+ * means that the vertexId is ACG, its successor node is A and its predecessor node is C.
+ * The successor and predecessor nodes are stored in vertexValue; edgeValue is unused.
+ * The details about the message type are in edu.uci.ics.pregelix.example.io.MessageWritable.
+ */
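+/*
+ * Worked example (illustrative, not taken from real data): with successor A
+ * (bit 00000001) and predecessor C (bit 01000000), adjMap = 01000001 = 0x41.
+ * The low nibble (adjMap & 0x0F) lists successors and the high nibble
+ * ((adjMap >> 4) & 0x0F) lists predecessors.
+ */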
+public class LoadGraphVertex extends Vertex<KmerBytesWritable, ValueStateWritable, NullWritable, NaiveAlgorithmMessageWritable>{
+
+ /**
+ * For test, just output original file
+ */
+ @Override
+ public void compute(Iterator<NaiveAlgorithmMessageWritable> msgIterator) {
+ voteToHalt();
+ }
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) throws Exception {
+ PregelixJob job = new PregelixJob(LoadGraphVertex.class.getSimpleName());
+ job.setVertexClass(LoadGraphVertex.class);
+ /**
+ * BinaryInput and BinaryOutput
+ */
+ job.setVertexInputFormatClass(NaiveAlgorithmForPathMergeInputFormat.class);
+ job.setVertexOutputFormatClass(NaiveAlgorithmForPathMergeOutputFormat.class);
+ job.setDynamicVertexValueSize(true);
+ job.setOutputKeyClass(KmerBytesWritable.class);
+ job.setOutputValueClass(ValueStateWritable.class);
+ Client.run(args, job);
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/LogAlgorithmForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/LogAlgorithmForPathMergeVertex.java
new file mode 100644
index 0000000..398d7ff
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/LogAlgorithmForPathMergeVertex.java
@@ -0,0 +1,358 @@
+package edu.uci.ics.genomix.pregelix.operator;
+
+import java.util.Iterator;
+
+import org.apache.hadoop.io.NullWritable;
+
+import edu.uci.ics.pregelix.api.graph.Vertex;
+import edu.uci.ics.pregelix.api.job.PregelixJob;
+import edu.uci.ics.genomix.pregelix.client.Client;
+import edu.uci.ics.genomix.pregelix.format.LogAlgorithmForPathMergeInputFormat;
+import edu.uci.ics.genomix.pregelix.format.LogAlgorithmForPathMergeOutputFormat;
+import edu.uci.ics.genomix.pregelix.io.LogAlgorithmMessageWritable;
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.pregelix.type.Message;
+import edu.uci.ics.genomix.pregelix.type.State;
+import edu.uci.ics.genomix.pregelix.util.GraphVertexOperation;
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+/*
+ * vertexId: KmerBytesWritable
+ * vertexValue: ValueStateWritable
+ * edgeValue: NullWritable
+ * message: LogAlgorithmMessageWritable
+ *
+ * DNA:
+ * A: 00
+ * C: 01
+ * G: 10
+ * T: 11
+ *
+ * successor node
+ * A 00000001 1
+ * G 00000010 2
+ * C 00000100 4
+ * T 00001000 8
+ * predecessor node
+ * A 00010000 16
+ * G 00100000 32
+ * C 01000000 64
+ * T 10000000 128
+ *
+ * For example, ONE LINE in the input file: 00,01,10 0001,0010,
+ * means that the vertexId is ACG, its successor node is A and its predecessor node is C.
+ * The successor and predecessor nodes are stored in vertexValue; edgeValue is unused.
+ * The details about the message type are in edu.uci.ics.pregelix.example.io.MessageWritable.
+ */
+public class LogAlgorithmForPathMergeVertex extends Vertex<KmerBytesWritable, ValueStateWritable, NullWritable, LogAlgorithmMessageWritable>{
+
+ public static final String KMER_SIZE = "LogAlgorithmForPathMergeVertex.kmerSize";
+ public static final String ITERATIONS = "LogAlgorithmForPathMergeVertex.iteration";
+ public static int kmerSize = -1;
+ private int maxIteration = -1;
+
+ private LogAlgorithmMessageWritable msg = new LogAlgorithmMessageWritable();
+
+ private VKmerBytesWritableFactory kmerFactory = new VKmerBytesWritableFactory(1);
+ private VKmerBytesWritable destVertexId = new VKmerBytesWritable(1);
+ private VKmerBytesWritable chainVertexId = new VKmerBytesWritable(1);
+ private VKmerBytesWritable lastKmer = new VKmerBytesWritable(1);
+ /**
+ * initiate kmerSize, maxIteration
+ */
+ public void initVertex(){
+ if(kmerSize == -1)
+ kmerSize = getContext().getConfiguration().getInt(KMER_SIZE, 5);
+ if (maxIteration < 0)
+ maxIteration = getContext().getConfiguration().getInt(ITERATIONS, 100);
+ }
+ /**
+ * get destination vertex
+ */
+ public VKmerBytesWritable getNextDestVertexId(KmerBytesWritable vertexId, byte geneCode){
+ return kmerFactory.shiftKmerWithNextCode(vertexId, geneCode);
+ }
+
+ public VKmerBytesWritable getPreDestVertexId(KmerBytesWritable vertexId, byte geneCode){
+ return kmerFactory.shiftKmerWithPreCode(vertexId, geneCode);
+ }
+
+ public VKmerBytesWritable getNextDestVertexIdFromBitmap(KmerBytesWritable chainVertexId, byte adjMap){
+ return getDestVertexIdFromChain(chainVertexId, adjMap);
+ }
+
+ public VKmerBytesWritable getDestVertexIdFromChain(KmerBytesWritable chainVertexId, byte adjMap){
+ lastKmer.set(kmerFactory.getLastKmerFromChain(kmerSize, chainVertexId));
+ return getNextDestVertexId(lastKmer, GeneCode.getGeneCodeFromBitMap((byte)(adjMap & 0x0F)));
+ }
+ /**
+ * head send message to all next nodes
+ */
+ public void sendMsgToAllNextNodes(KmerBytesWritable vertexId, byte adjMap){
+ for(byte x = GeneCode.A; x <= GeneCode.T; x++){
+ if((adjMap & (1 << x)) != 0){
+ destVertexId.set(getNextDestVertexId(vertexId, x));
+ sendMsg(destVertexId, msg);
+ }
+ }
+ }
+ /**
+ * head send message to all previous nodes
+ */
+ public void sendMsgToAllPreviousNodes(KmerBytesWritable vertexId, byte adjMap){
+ for(byte x = GeneCode.A; x <= GeneCode.T; x++){
+ if(((adjMap >> 4) & (1 << x)) != 0){
+ destVertexId.set(getPreDestVertexId(vertexId, x));
+ sendMsg(destVertexId, msg);
+ }
+ }
+ }
+
+ /**
+ * set vertex state
+ */
+ public void setState(){
+ if(msg.getMessage() == Message.START &&
+ (getVertexValue().getState() == State.MID_VERTEX || getVertexValue().getState() == State.END_VERTEX)){
+ getVertexValue().setState(State.START_VERTEX);
+ setVertexValue(getVertexValue());
+ }
+ else if(msg.getMessage() == Message.END && getVertexValue().getState() == State.MID_VERTEX){
+ getVertexValue().setState(State.END_VERTEX);
+ setVertexValue(getVertexValue());
+ voteToHalt();
+ }
+ else
+ voteToHalt();
+ }
+ /**
+ * send start message to next node
+ */
+ public void sendStartMsgToNextNode(){
+ msg.setMessage(Message.START);
+ msg.setSourceVertexId(getVertexId());
+ sendMsg(destVertexId, msg);
+ voteToHalt();
+ }
+ /**
+ * send end message to next node
+ */
+ public void sendEndMsgToNextNode(){
+ msg.setMessage(Message.END);
+ msg.setSourceVertexId(getVertexId());
+ sendMsg(destVertexId, msg);
+ voteToHalt();
+ }
+ /**
+ * send NON message to next node
+ */
+ public void sendNonMsgToNextNode(){
+ msg.setMessage(Message.NON);
+ msg.setSourceVertexId(getVertexId());
+ sendMsg(destVertexId, msg);
+ }
+ /**
+ * head send message to path
+ */
+ public void sendMsgToPathVertex(KmerBytesWritable chainVertexId, byte adjMap){
+ if(GeneCode.getGeneCodeFromBitMap((byte)(getVertexValue().getAdjMap() & 0x0F)) == -1) //|| lastKmer == null
+ voteToHalt();
+ else{
+ destVertexId.set(getNextDestVertexIdFromBitmap(chainVertexId, adjMap));
+ if(getVertexValue().getState() == State.START_VERTEX){
+ sendStartMsgToNextNode();
+ }
+ else if(getVertexValue().getState() != State.END_VERTEX && getVertexValue().getState() != State.FINAL_DELETE){
+ sendEndMsgToNextNode();
+ }
+ }
+ }
+ /**
+ * path send message to head
+ */
+ public void responseMsgToHeadVertex(){
+ if(getVertexValue().getLengthOfMergeChain() == -1){
+ getVertexValue().setMergeChain(getVertexId());
+ setVertexValue(getVertexValue());
+ }
+ msg.set(msg.getSourceVertexId(), getVertexValue().getMergeChain(), getVertexValue().getAdjMap(), msg.getMessage(), getVertexValue().getState());
+ setMessageType(msg.getMessage());
+ destVertexId.set(msg.getSourceVertexId());
+ sendMsg(destVertexId, msg);
+ }
+ /**
+ * set message type
+ */
+ public void setMessageType(int message){
+ //kill Message because it has been merged by the head
+ if(getVertexValue().getState() == State.END_VERTEX || getVertexValue().getState() == State.FINAL_DELETE){
+ msg.setMessage(Message.END);
+ getVertexValue().setState(State.FINAL_DELETE);
+ setVertexValue(getVertexValue());
+ //deleteVertex(getVertexId());
+ }
+ else
+ msg.setMessage(Message.NON);
+
+ if(message == Message.START){
+ getVertexValue().setState(State.TODELETE);
+ setVertexValue(getVertexValue());
+ }
+ }
+ /**
+ * set vertexValue's state, merge chain and adjacency map
+ */
+ public void setVertexValueAttributes(){
+ if(msg.getMessage() == Message.END){
+ if(getVertexValue().getState() != State.START_VERTEX)
+ getVertexValue().setState(State.END_VERTEX);
+ else
+ getVertexValue().setState(State.FINAL_VERTEX);
+ }
+
+ if(getSuperstep() == 5)
+ chainVertexId.set(getVertexId());
+ else
+ chainVertexId.set(getVertexValue().getMergeChain());
+ lastKmer.set(kmerFactory.getLastKmerFromChain(msg.getLengthOfChain() - kmerSize + 1, msg.getChainVertexId()));
+ chainVertexId.set(kmerFactory.mergeTwoKmer(chainVertexId, lastKmer));
+ getVertexValue().setMergeChain(chainVertexId);
+
+ byte tmpVertexValue = GraphVertexOperation.updateRightNeighber(getVertexValue().getAdjMap(), msg.getAdjMap());
+ getVertexValue().setAdjMap(tmpVertexValue);
+ }
+ /**
+ * send message to self
+ */
+ public void sendMsgToSelf(){
+ if(msg.getMessage() != Message.END){
+ setVertexValue(getVertexValue());
+ msg.reset(); //reset
+ msg.setAdjMap(getVertexValue().getAdjMap());
+ sendMsg(getVertexId(),msg);
+ }
+ }
+ /**
+ * start sending message
+ */
+ public void startSendMsg(){
+ if(GraphVertexOperation.isHeadVertex(getVertexValue().getAdjMap())){
+ msg.set(getVertexId(), chainVertexId, (byte)0, Message.START, State.NON_VERTEX);
+ sendMsgToAllNextNodes(getVertexId(), getVertexValue().getAdjMap());
+ voteToHalt();
+ }
+ if(GraphVertexOperation.isRearVertex(getVertexValue().getAdjMap())){
+ msg.set(getVertexId(), chainVertexId, (byte)0, Message.END, State.NON_VERTEX);
+ sendMsgToAllPreviousNodes(getVertexId(), getVertexValue().getAdjMap());
+ voteToHalt();
+ }
+ if(GraphVertexOperation.isPathVertex(getVertexValue().getAdjMap())){
+ getVertexValue().setState(State.MID_VERTEX);
+ setVertexValue(getVertexValue());
+ }
+ }
+ /**
+ * initiate head, rear and path node
+ */
+ public void initState(Iterator<LogAlgorithmMessageWritable> msgIterator){
+ while(msgIterator.hasNext()){
+ if(!GraphVertexOperation.isPathVertex(getVertexValue().getAdjMap())){
+ msgIterator.next();
+ voteToHalt();
+ }
+ else{
+ msg = msgIterator.next();
+ setState();
+ }
+ }
+ }
+ /**
+ * head send message to path
+ */
+ public void sendMsgToPathVertex(Iterator<LogAlgorithmMessageWritable> msgIterator){
+ if(getSuperstep() == 3){
+ msg.reset();
+ sendMsgToPathVertex(getVertexId(), getVertexValue().getAdjMap());
+ }
+ else{
+ if(msgIterator.hasNext()){
+ msg = msgIterator.next();
+ sendMsgToPathVertex(getVertexValue().getMergeChain(), msg.getAdjMap());
+ }
+ }
+ }
+ /**
+ * path response message to head
+ */
+ public void responseMsgToHeadVertex(Iterator<LogAlgorithmMessageWritable> msgIterator){
+ if(msgIterator.hasNext()){
+ msg = msgIterator.next();
+ responseMsgToHeadVertex();
+ //voteToHalt();
+ }
+ else{
+ if(getVertexValue().getState() != State.START_VERTEX
+ && getVertexValue().getState() != State.END_VERTEX && getVertexValue().getState() != State.FINAL_DELETE){
+ deleteVertex(getVertexId());//killSelf because it doesn't receive any message
+ }
+ }
+ }
+ /**
+ * merge chainVertex and store in vertexVal.chainVertexId
+ */
+ public void mergeChainVertex(Iterator<LogAlgorithmMessageWritable> msgIterator){
+ if(msgIterator.hasNext()){
+ msg = msgIterator.next();
+ setVertexValueAttributes();
+ sendMsgToSelf();
+ }
+ if(getVertexValue().getState() == State.END_VERTEX || getVertexValue().getState() == State.FINAL_DELETE){
+ voteToHalt();
+ }
+ if(getVertexValue().getState() == State.FINAL_VERTEX){
+ voteToHalt();
+ }
+ }
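+ /**
+ * Superstep dispatch (summary of the branches below): superstep 1 classifies
+ * head/rear/path vertices, superstep 2 fixes their initial states, and every
+ * following round of three supersteps runs head-to-path send (step%3 == 0),
+ * path-to-head response (step%3 == 1), then deletion of TODELETE vertices
+ * and chain merging (step%3 == 2), until maxIteration is reached.
+ */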
+ @Override
+ public void compute(Iterator<LogAlgorithmMessageWritable> msgIterator) {
+ initVertex();
+ if (getSuperstep() == 1)
+ startSendMsg();
+ else if(getSuperstep() == 2)
+ initState(msgIterator);
+ else if(getSuperstep()%3 == 0 && getSuperstep() <= maxIteration){
+ sendMsgToPathVertex(msgIterator);
+ }
+ else if(getSuperstep()%3 == 1 && getSuperstep() <= maxIteration){
+ responseMsgToHeadVertex(msgIterator);
+ }
+ else if(getSuperstep()%3 == 2 && getSuperstep() <= maxIteration){
+ if(getVertexValue().getState() == State.TODELETE){
+ deleteVertex(getVertexId()); //killSelf
+ }
+ else{
+ mergeChainVertex(msgIterator);
+ }
+ }
+ }
+ /**
+ * @param args
+ */
+ public static void main(String[] args) throws Exception {
+ PregelixJob job = new PregelixJob(LogAlgorithmForPathMergeVertex.class.getSimpleName());
+ job.setVertexClass(LogAlgorithmForPathMergeVertex.class);
+ /**
+ * BinaryInput and BinaryOutput
+ */
+ job.setVertexInputFormatClass(LogAlgorithmForPathMergeInputFormat.class);
+ job.setVertexOutputFormatClass(LogAlgorithmForPathMergeOutputFormat.class);
+ job.setOutputKeyClass(KmerBytesWritable.class);
+ job.setOutputValueClass(ValueStateWritable.class);
+ job.setDynamicVertexValueSize(true);
+ Client.run(args, job);
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/NaiveAlgorithmForPathMergeVertex.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/NaiveAlgorithmForPathMergeVertex.java
new file mode 100644
index 0000000..04b8525
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/operator/NaiveAlgorithmForPathMergeVertex.java
@@ -0,0 +1,202 @@
+package edu.uci.ics.genomix.pregelix.operator;
+
+import java.util.Iterator;
+
+import org.apache.hadoop.io.NullWritable;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+import edu.uci.ics.genomix.type.VKmerBytesWritableFactory;
+
+import edu.uci.ics.pregelix.api.graph.Vertex;
+import edu.uci.ics.pregelix.api.job.PregelixJob;
+import edu.uci.ics.genomix.pregelix.client.Client;
+import edu.uci.ics.genomix.pregelix.format.NaiveAlgorithmForPathMergeInputFormat;
+import edu.uci.ics.genomix.pregelix.format.NaiveAlgorithmForPathMergeOutputFormat;
+import edu.uci.ics.genomix.pregelix.io.NaiveAlgorithmMessageWritable;
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.pregelix.type.State;
+import edu.uci.ics.genomix.pregelix.util.GraphVertexOperation;
+
+/*
+ * vertexId: KmerBytesWritable
+ * vertexValue: ValueStateWritable
+ * edgeValue: NullWritable
+ * message: NaiveAlgorithmMessageWritable
+ *
+ * DNA:
+ * A: 00
+ * C: 01
+ * G: 10
+ * T: 11
+ *
+ * successor node
+ * A 00000001 1
+ * G 00000010 2
+ * C 00000100 4
+ * T 00001000 8
+ * predecessor node
+ * A 00010000 16
+ * G 00100000 32
+ * C 01000000 64
+ * T 10000000 128
+ *
+ * For example, ONE LINE in the input file: 00,01,10 0001,0010,
+ * means that the vertexId is ACG, its successor node is A and its predecessor node is C.
+ * The successor and predecessor nodes are stored in vertexValue; edgeValue is unused.
+ * The details about the message type are in edu.uci.ics.pregelix.example.io.MessageWritable.
+ */
+/**
+ * Naive Algorithm for path merge graph
+ */
+public class NaiveAlgorithmForPathMergeVertex extends Vertex<KmerBytesWritable, ValueStateWritable, NullWritable, NaiveAlgorithmMessageWritable>{
+
+ public static final String KMER_SIZE = "NaiveAlgorithmForPathMergeVertex.kmerSize";
+ public static final String ITERATIONS = "NaiveAlgorithmForPathMergeVertex.iteration";
+ public static int kmerSize = -1;
+ private int maxIteration = -1;
+
+ private NaiveAlgorithmMessageWritable msg = new NaiveAlgorithmMessageWritable();
+
+ private VKmerBytesWritableFactory kmerFactory = new VKmerBytesWritableFactory(1);
+ private VKmerBytesWritable destVertexId = new VKmerBytesWritable(1);
+ private VKmerBytesWritable chainVertexId = new VKmerBytesWritable(1);
+ private VKmerBytesWritable lastKmer = new VKmerBytesWritable(1);
+
+ /**
+ * initiate kmerSize, maxIteration
+ */
+ public void initVertex(){
+ if(kmerSize == -1)
+ kmerSize = getContext().getConfiguration().getInt(KMER_SIZE, 5);
+ if (maxIteration < 0)
+ maxIteration = getContext().getConfiguration().getInt(ITERATIONS, 100);
+ }
+ public void findDestination(){
+ destVertexId.set(msg.getSourceVertexId());
+ }
+ /**
+ * get destination vertex
+ */
+ public VKmerBytesWritable getDestVertexId(KmerBytesWritable vertexId, byte geneCode){
+ return kmerFactory.shiftKmerWithNextCode(vertexId, geneCode);
+ }
+
+ public VKmerBytesWritable getDestVertexIdFromChain(VKmerBytesWritable chainVertexId, byte adjMap){
+ lastKmer.set(kmerFactory.getLastKmerFromChain(kmerSize, chainVertexId));
+ return getDestVertexId(lastKmer, GeneCode.getGeneCodeFromBitMap((byte)(adjMap & 0x0F)));
+ }
+ /**
+ * head send message to all next nodes
+ */
+ public void sendMsgToAllNextNodes(KmerBytesWritable vertexId, byte adjMap){
+ for(byte x = GeneCode.A; x <= GeneCode.T; x++){
+ if((adjMap & (1 << x)) != 0){
+ destVertexId.set(getDestVertexId(vertexId, x));
+ sendMsg(destVertexId, msg);
+ }
+ }
+ }
+ /**
+ * initiate chain vertex
+ */
+ public void initChainVertex(){
+ if(!msg.isRear()){
+ findDestination();
+ if(GraphVertexOperation.isPathVertex(getVertexValue().getAdjMap())){
+ chainVertexId.set(getVertexId());
+ msg.set(getVertexId(), chainVertexId, getVertexId(), getVertexValue().getAdjMap(), false);
+ sendMsg(destVertexId, msg);
+ }else if(GraphVertexOperation.isRearVertex(getVertexValue().getAdjMap()))
+ voteToHalt();
+ }
+ }
+ /**
+ * head node sends message to path node
+ */
+ public void sendMsgToPathVertex(){
+ if(!msg.isRear()){
+ destVertexId.set(getDestVertexIdFromChain(msg.getChainVertexId(), msg.getAdjMap()));
+ }else{
+ destVertexId.set(msg.getHeadVertexId());
+ }
+ msg.set(getVertexId(), msg.getChainVertexId(), msg.getHeadVertexId(), (byte)0, msg.isRear());
+ sendMsg(destVertexId, msg);
+ }
+ /**
+ * path node sends message back to head node
+ */
+ public void responseMsgToHeadVertex(){
+ if(!msg.isRear()){
+ findDestination();
+ if(GraphVertexOperation.isPathVertex(getVertexValue().getAdjMap())){
+ chainVertexId = kmerFactory.mergeKmerWithNextCode(msg.getChainVertexId(),
+ getVertexId().getGeneCodeAtPosition(kmerSize - 1));
+ deleteVertex(getVertexId());
+ msg.set(getVertexId(), chainVertexId, msg.getHeadVertexId(), getVertexValue().getAdjMap(), false);
+ sendMsg(destVertexId, msg);
+ }
+ else if(GraphVertexOperation.isRearVertex(getVertexValue().getAdjMap())){
+ msg.set(getVertexId(), msg.getChainVertexId(), msg.getHeadVertexId(), (byte)0, true);
+ sendMsg(destVertexId, msg);
+ }
+ }else{// is Rear
+ chainVertexId.set(msg.getSourceVertexId());
+ getVertexValue().set(GraphVertexOperation.updateRightNeighberByVertexId(getVertexValue().getAdjMap(), chainVertexId, kmerSize),
+ State.START_VERTEX, msg.getChainVertexId());
+ setVertexValue(getVertexValue());
+ }
+ }
+
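+ /**
+ * Superstep dispatch (summary of compute() below): superstep 1 lets head
+ * vertices message all successors, superstep 2 initializes chain vertices,
+ * then odd supersteps forward the chain from head to path vertices and even
+ * supersteps send the merged chain back, until maxIteration is reached.
+ */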
+ @Override
+ public void compute(Iterator<NaiveAlgorithmMessageWritable> msgIterator) {
+ initVertex();
+ if (getSuperstep() == 1) {
+ if(GraphVertexOperation.isHeadVertex(getVertexValue().getAdjMap())){
+ msg.set(getVertexId(), chainVertexId, getVertexId(), (byte)0, false);
+ sendMsgToAllNextNodes(getVertexId(), getVertexValue().getAdjMap());
+ }
+ }
+ else if(getSuperstep() == 2){
+ if(msgIterator.hasNext()){
+ msg = msgIterator.next();
+ initChainVertex();
+ }
+ }
+ //head node sends message to path node
+ else if(getSuperstep()%2 == 1 && getSuperstep() <= maxIteration){
+ while (msgIterator.hasNext()){
+ msg = msgIterator.next();
+ sendMsgToPathVertex();
+ }
+ }
+ //path node sends message back to head node
+ else if(getSuperstep()%2 == 0 && getSuperstep() > 2 && getSuperstep() <= maxIteration){
+ while(msgIterator.hasNext()){
+ msg = msgIterator.next();
+ responseMsgToHeadVertex();
+ }
+ }
+ voteToHalt();
+ }
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) throws Exception {
+ PregelixJob job = new PregelixJob(NaiveAlgorithmForPathMergeVertex.class.getSimpleName());
+ job.setVertexClass(NaiveAlgorithmForPathMergeVertex.class);
+ /**
+ * BinaryInput and BinaryOutput
+ */
+ job.setVertexInputFormatClass(NaiveAlgorithmForPathMergeInputFormat.class);
+ job.setVertexOutputFormatClass(NaiveAlgorithmForPathMergeOutputFormat.class);
+ job.setDynamicVertexValueSize(true);
+ job.setOutputKeyClass(KmerBytesWritable.class);
+ job.setOutputValueClass(ValueStateWritable.class);
+ Client.run(args, job);
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/CombineSequenceFile.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/CombineSequenceFile.java
new file mode 100644
index 0000000..c7349dd
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/CombineSequenceFile.java
@@ -0,0 +1,59 @@
+package edu.uci.ics.genomix.pregelix.sequencefile;
+
+import java.io.File;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+public class CombineSequenceFile {
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ int kmerSize = 5;
+ Configuration conf = new Configuration();
+ FileSystem fileSys = FileSystem.get(conf);
+
+ Path p = new Path("output");
+ //Path p2 = new Path("data/result");
+ Path outFile = new Path("output");
+ SequenceFile.Reader reader;
+ SequenceFile.Writer writer = SequenceFile.createWriter(fileSys, conf,
+ outFile, KmerBytesWritable.class, KmerCountValue.class,
+ CompressionType.NONE);
+ KmerBytesWritable key = new KmerBytesWritable(kmerSize);
+ KmerCountValue value = new KmerCountValue();
+
+ File dir = new File("output");
+ for(File child : dir.listFiles()){
+ String name = child.getAbsolutePath();
+ Path inFile = new Path(p, name);
+ reader = new SequenceFile.Reader(fileSys, inFile, conf);
+ while (reader.next(key, value)) {
+ System.out.println(key.toString()
+ + "\t" + value.toString());
+ writer.append(key, value);
+ }
+ reader.close();
+ }
+ writer.close();
+ System.out.println();
+
+ reader = new SequenceFile.Reader(fileSys, outFile, conf);
+ while (reader.next(key, value)) {
+ System.err.println(key.toString()
+ + "\t" + value.toString());
+ }
+ reader.close();
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/ConvertToSequenceFile.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/ConvertToSequenceFile.java
new file mode 100644
index 0000000..d64b279
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/ConvertToSequenceFile.java
@@ -0,0 +1,42 @@
+package edu.uci.ics.genomix.pregelix.sequencefile;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+
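+/**
+ * Converts a plain-text graph dump into a SequenceFile using the identity
+ * Mapper with zero reduce tasks, so each record is (LongWritable offset,
+ * Text line). Input and output paths are hard-coded below.
+ */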
+public class ConvertToSequenceFile {
+ public static void main(String[] args) throws IOException,
+ InterruptedException, ClassNotFoundException {
+
+ Configuration conf = new Configuration();
+ Job job = new Job(conf);
+ job.setJobName("Convert Text");
+ job.setJarByClass(Mapper.class);
+
+ job.setMapperClass(Mapper.class);
+ job.setReducerClass(Reducer.class);
+
+ // increase if you need sorting or a special number of files
+ job.setNumReduceTasks(0);
+
+ job.setOutputKeyClass(LongWritable.class);
+ job.setOutputValueClass(Text.class);
+
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setInputFormatClass(TextInputFormat.class);
+
+ TextInputFormat.addInputPath(job, new Path("data/webmap/part-00000"));
+ SequenceFileOutputFormat.setOutputPath(job, new Path("folder_seq"));
+
+ // submit and wait for completion
+ job.waitForCompletion(true);
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateSmallFile.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateSmallFile.java
new file mode 100644
index 0000000..d97a2fd
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateSmallFile.java
@@ -0,0 +1,48 @@
+package edu.uci.ics.genomix.pregelix.sequencefile;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+public class GenerateSmallFile {
+
+ public static void generateNumOfLinesFromBigFile(Path inFile, Path outFile, int numOfLines) throws IOException{
+ Configuration conf = new Configuration();
+ FileSystem fileSys = FileSystem.get(conf);
+
+ SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, inFile, conf);
+ SequenceFile.Writer writer = SequenceFile.createWriter(fileSys, conf,
+ outFile, KmerBytesWritable.class, KmerCountValue.class,
+ CompressionType.NONE);
+ KmerBytesWritable outKey = new KmerBytesWritable(55);
+ KmerCountValue outValue = new KmerCountValue();
+ for(int i = 0; i < numOfLines; i++){
+ reader.next(outKey, outValue);
+ writer.append(outKey, outValue);
+ }
+ writer.close();
+ reader.close();
+ }
+ /**
+ * @param args
+ * @throws IOException
+ */
+ public static void main(String[] args) throws IOException {
+ Path dir = new Path("data");
+ Path inFile = new Path(dir, "part-0");
+ Path outFile = new Path(dir, "part-0-out-5000000");
+ generateNumOfLinesFromBigFile(inFile,outFile,5000000);
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateTextFile.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateTextFile.java
new file mode 100644
index 0000000..8d06e8e
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/sequencefile/GenerateTextFile.java
@@ -0,0 +1,68 @@
+package edu.uci.ics.genomix.pregelix.sequencefile;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.genomix.type.KmerCountValue;
+
+public class GenerateTextFile {
+
+ public static void generateFromPathmergeResult() throws IOException{
+ BufferedWriter bw = new BufferedWriter(new FileWriter("text/log_TreePath"));
+ Configuration conf = new Configuration();
+ FileSystem fileSys = FileSystem.get(conf);
+ for(int i = 0; i < 2; i++){
+ Path path = new Path("output/part-" + i);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, path, conf);
+ KmerBytesWritable key = new KmerBytesWritable(5);
+ ValueStateWritable value = new ValueStateWritable();
+
+ while(reader.next(key, value)){
+ if (key == null || value == null){
+ break;
+ }
+ bw.write(key.toString()
+ + "\t" + value.toString());
+ bw.newLine();
+ }
+ reader.close();
+ }
+ bw.close();
+ }
+ public static void generateFromGraphbuildResult() throws IOException{
+ BufferedWriter bw = new BufferedWriter(new FileWriter("textfile"));
+ Configuration conf = new Configuration();
+ FileSystem fileSys = FileSystem.get(conf);
+ Path path = new Path("data/input/part-0-out-3000000");
+ SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, path, conf);
+ KmerBytesWritable key = new KmerBytesWritable(55);
+ KmerCountValue value = new KmerCountValue();
+
+ while(reader.next(key, value)){
+ if (key == null || value == null){
+ break;
+ }
+ bw.write(key.toString());
+ bw.newLine();
+ }
+ reader.close();
+ bw.close();
+ }
+ /**
+ * @param args
+ * @throws IOException
+ */
+ public static void main(String[] args) throws IOException {
+ generateFromPathmergeResult();
+ generateFromGraphbuildResult();
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testcase/GenerateTestInput.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testcase/GenerateTestInput.java
new file mode 100644
index 0000000..6d81b74
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testcase/GenerateTestInput.java
@@ -0,0 +1,77 @@
+package edu.uci.ics.genomix.pregelix.testcase;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+
+public class GenerateTestInput {
+
+ /**
+ * Simple path: numLines independent random reads, each of the given length
+ */
+ public static String simplePath(int k, int length, int numLines){
+ RandomString rs = new RandomString(k, length);
+ String output = "";
+ for(int i = 0; i < numLines; i++)
+ output += rs.nextString(0) + "\r\n";
+ return output;
+ }
+ /**
+ * Tree path: three reads where s2 branches off s1 after x bases and s3 branches off s2 after x + y bases
+ */
+ public static String treePath(int k, int x, int y, int z){
+ RandomString rs = new RandomString(k, x + y + k - 1);
+ String s1 = rs.nextString(0);
+ rs.setLength(x + y + z + k - 1);
+ rs.addString(s1.substring(0, x));
+ String s2 = rs.nextString(x);
+ rs.setLength(x + y + z + k - 1);
+ rs.addString(s2.substring(0,x + y));
+ String s3 = rs.nextString(x + y);
+ return s1 + "\r\n" + s2 + "\r\n" + s3;
+ }
+ /**
+ * Cycle path: a read extended by an overlapping copy of its own head, so the k-mer path closes into a cycle
+ */
+ public static String cyclePath(int k, int length){
+ RandomString rs = new RandomString(k, length);
+ String s1 = rs.nextString(0);
+ String s2 = s1 + s1.substring(1, k + 1);
+ return s2;
+ }
+ /**
+ * Bridge path: two reads that share their first k + 2 and last k - 1 bases but differ in the middle
+ */
+ public static String bridgePath(int k, int x){
+ RandomString rs = new RandomString(k, x + k + 2 + k - 1);
+ String s1 = rs.nextString(0);
+ rs.setLength(x + k + 2);
+ rs.addString(s1.substring(0, k + 2));
+ String s2 = rs.nextString(k + 2) + s1.substring(x + k + 2, x + k + 2 + k - 1);
+ return s1 + "\r\n" + s2;
+ }
+
+ public static void main(String[] args) {
+ OutputStreamWriter writer;
+ try {
+ writer = new OutputStreamWriter(new FileOutputStream("graph/55/SinglePath_55"));
+ writer.write(simplePath(55,60,1));
+ writer.close();
+ writer = new OutputStreamWriter(new FileOutputStream("graph/55/SimplePath_55"));
+ writer.write(simplePath(55,60,3));
+ writer.close();
+ writer = new OutputStreamWriter(new FileOutputStream("graph/55/TreePath"));
+ writer.write(treePath(55, 5, 5, 3));
+ writer.close();
+ writer = new OutputStreamWriter(new FileOutputStream("graph/55/CyclePath"));
+ writer.write(cyclePath(55,60));
+ writer.close();
+ writer = new OutputStreamWriter(new FileOutputStream("graph/55/BridgePath"));
+ writer.write(bridgePath(55,2));
+ writer.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testcase/RandomString.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testcase/RandomString.java
new file mode 100644
index 0000000..337c5d8
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/testcase/RandomString.java
@@ -0,0 +1,63 @@
+package edu.uci.ics.genomix.pregelix.testcase;
+
+import java.util.ArrayList;
+import java.util.Random;
+
+public class RandomString
+{
+
+ private static final char[] symbols = new char[4];
+
+ static {
+ symbols[0] = 'A';
+ symbols[1] = 'C';
+ symbols[2] = 'G';
+ symbols[3] = 'T';
+ }
+
+ private final Random random = new Random();
+
+ private char[] buf;
+
+ private ArrayList<String> existKmer = new ArrayList<String>();
+
+ private int k;
+
+ public RandomString(int k, int length)
+ {
+ if (length < 1)
+ throw new IllegalArgumentException("length < 1: " + length);
+ buf = new char[length];
+ this.k = k;
+ }
+
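+ /**
+ * Fills buf from startIdx with random bases, re-drawing a position until the
+ * k-mer ending there has not been generated before (existKmer persists across
+ * calls), so every k-mer in the produced reads is unique.
+ */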
+ public String nextString(int startIdx)
+ {
+ String tmp = "";
+ for (int idx = startIdx; idx < buf.length;){
+ buf[idx] = symbols[random.nextInt(4)];
+ if(idx >= k - 1){
+ tmp = new String(buf, idx-k+1, k);
+ if(!existKmer.contains(tmp)){
+ existKmer.add(tmp);
+ idx++;
+ }
+ }
+ else
+ idx++;
+ }
+
+ return new String(buf);
+ }
+
+ public void setLength(int length){
+ buf = new char[length];
+ }
+
+ public void addString(String s){
+ char[] tmp = s.toCharArray();
+ for(int i = 0; i < tmp.length; i++)
+ buf[i] = tmp[i];
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/type/Message.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/type/Message.java
new file mode 100644
index 0000000..f556a73
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/type/Message.java
@@ -0,0 +1,27 @@
+package edu.uci.ics.genomix.pregelix.type;
+
+public class Message {
+
+ public static final int NON = 0;
+ public static final int START = 1;
+ public static final int END = 2;
+
+ public final static class MESSAGE_CONTENT{
+
+ public static String getContentFromCode(int code){
+ String r = "";
+ switch(code){
+ case NON:
+ r = "NON";
+ break;
+ case START:
+ r = "START";
+ break;
+ case END:
+ r = "END";
+ break;
+ }
+ return r;
+ }
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/type/State.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/type/State.java
new file mode 100644
index 0000000..61bcb0c
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/type/State.java
@@ -0,0 +1,46 @@
+package edu.uci.ics.genomix.pregelix.type;
+
+public class State {
+ public static final int NON_VERTEX = 0;
+ public static final int START_VERTEX = 1;
+ public static final int END_VERTEX = 2;
+ public static final int MID_VERTEX = 3;
+ public static final int TODELETE = 4;
+ public static final int FINAL_VERTEX = 5;
+ public static final int FINAL_DELETE = 6;
+ public static final int KILL_SELF = 7;
+
+ public final static class STATE_CONTENT{
+
+ public static String getContentFromCode(int code){
+ String r = "";
+ switch(code){
+ case NON_VERTEX:
+ r = "NON_VERTEX";
+ break;
+ case START_VERTEX:
+ r = "START_VERTEX";
+ break;
+ case END_VERTEX:
+ r = "END_VERTEX";
+ break;
+ case MID_VERTEX:
+ r = "MID_VERTEX";
+ break;
+ case TODELETE:
+ r = "TODELETE";
+ break;
+ case FINAL_VERTEX:
+ r = "FINAL_VERTEX";
+ break;
+ case FINAL_DELETE:
+ r = "FINAL_DELETE";
+ break;
+ case KILL_SELF:
+ r = "KILL_SELF";
+ break;
+ }
+ return r;
+ }
+ }
+}
diff --git a/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/util/GraphVertexOperation.java b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/util/GraphVertexOperation.java
new file mode 100644
index 0000000..5cb0c2b
--- /dev/null
+++ b/genomix/genomix-pregelix/src/main/java/edu/uci/ics/genomix/pregelix/util/GraphVertexOperation.java
@@ -0,0 +1,62 @@
+package edu.uci.ics.genomix.pregelix.util;
+
+import edu.uci.ics.genomix.type.GeneCode;
+import edu.uci.ics.genomix.type.VKmerBytesWritable;
+
+public class GraphVertexOperation {
+
+ /**
+ * generate the valid data(byte[]) from BytesWritable
+ */
+ public static byte[] generateValidDataFromBytesWritable(VKmerBytesWritable bw){
+ byte[] wholeBytes = bw.getBytes();
+ int validNum = bw.getLength();
+ byte[] validBytes = new byte[validNum];
+ for(int i = 0; i < validNum; i++)
+ validBytes[i] = wholeBytes[i];
+ return validBytes;
+ }
+ /**
+ * Path vertex: in-degree = out-degree = 1
+ * @param value adjacency map of the vertex
+ */
+ public static boolean isPathVertex(byte value){
+ return GeneCode.inDegree(value) == 1 && GeneCode.outDegree(value) == 1;
+ }
+ /**
+ * Head vertex: out-degree > 0 and not a path vertex
+ * @param value adjacency map of the vertex
+ */
+ public static boolean isHeadVertex(byte value){
+ return GeneCode.outDegree(value) > 0 && !isPathVertex(value);
+ }
+ /**
+ * Rear vertex: in-degree > 0 and not a path vertex
+ * @param value adjacency map of the vertex
+ */
+ public static boolean isRearVertex(byte value){
+ return GeneCode.inDegree(value) > 0 && !isPathVertex(value);
+ }
+ /**
+ * update the right (successor) neighbor nibble based on the next vertexId
+ */
+ public static byte updateRightNeighberByVertexId(byte oldVertexValue, VKmerBytesWritable neighberVertex, int k){
+ byte geneCode = neighberVertex.getGeneCodeAtPosition(k-1);
+
+ byte newBit = GeneCode.getAdjBit(geneCode);
+ return (byte) ((byte)(oldVertexValue & 0xF0) | (byte) (newBit & 0x0F));
+ }
+ /**
+ * update the right (successor) neighbor nibble
+ */
+ public static byte updateRightNeighber(byte oldVertexValue, byte newVertexValue){
+ return (byte) ((byte)(oldVertexValue & 0xF0) | (byte) (newVertexValue & 0x0F));
+ }
+
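+ /**
+ * Illustrative sketch (not part of the original API): shows how
+ * updateRightNeighber keeps the predecessor nibble of the old value and
+ * takes the successor nibble from the new value.
+ */
+ public static void main(String[] args) {
+ byte oldValue = (byte) 0xF3; // predecessors 1111, successors 0011
+ byte newValue = (byte) 0x0C; // successors 1100
+ // prints 11111100: high nibble from oldValue, low nibble from newValue
+ System.out.println(Integer.toBinaryString(updateRightNeighber(oldValue, newValue) & 0xFF));
+ }
+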
+}
diff --git a/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobGen/JobGenerator.java b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobGen/JobGenerator.java
new file mode 100644
index 0000000..f03a4ab
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobGen/JobGenerator.java
@@ -0,0 +1,93 @@
+package edu.uci.ics.genomix.pregelix.JobGen;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+import edu.uci.ics.genomix.pregelix.format.LogAlgorithmForPathMergeOutputFormat;
+import edu.uci.ics.genomix.pregelix.format.NaiveAlgorithmForPathMergeInputFormat;
+import edu.uci.ics.genomix.pregelix.format.NaiveAlgorithmForPathMergeOutputFormat;
+import edu.uci.ics.genomix.pregelix.format.LogAlgorithmForPathMergeInputFormat;
+import edu.uci.ics.genomix.pregelix.io.ValueStateWritable;
+import edu.uci.ics.genomix.pregelix.operator.LoadGraphVertex;
+import edu.uci.ics.genomix.pregelix.operator.LogAlgorithmForPathMergeVertex;
+import edu.uci.ics.genomix.pregelix.operator.NaiveAlgorithmForPathMergeVertex;
+import edu.uci.ics.genomix.type.KmerBytesWritable;
+import edu.uci.ics.pregelix.api.job.PregelixJob;
+
+
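+/**
+ * Writes PregelixJob configurations out as XML job files under
+ * src/test/resources/jobs/ so that the JobRun tests can replay them.
+ */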
+public class JobGenerator {
+
+ private static String outputBase = "src/test/resources/jobs/";
+ private static String HDFS_INPUTPATH = "/webmap";
+ private static String HDFS_OUTPUTPAH = "/result";
+
+ private static void generateLoadGraphJob(String jobName, String outputPath) throws IOException {
+ PregelixJob job = new PregelixJob(jobName);
+ job.setVertexClass(LoadGraphVertex.class);
+ job.setVertexInputFormatClass(NaiveAlgorithmForPathMergeInputFormat.class);
+ job.setVertexOutputFormatClass(NaiveAlgorithmForPathMergeOutputFormat.class);
+ job.setDynamicVertexValueSize(true);
+ job.setOutputKeyClass(KmerBytesWritable.class);
+ job.setOutputValueClass(ValueStateWritable.class);
+ FileInputFormat.setInputPaths(job, HDFS_INPUTPATH);
+ FileOutputFormat.setOutputPath(job, new Path(HDFS_OUTPUTPATH));
+ job.getConfiguration().writeXml(new FileOutputStream(new File(outputPath)));
+ }
+
+ private static void genLoadGraph() throws IOException {
+ generateLoadGraphJob("LoadGraph", outputBase + "LoadGraph.xml");
+ }
+
+ private static void generateMergeGraphJob(String jobName, String outputPath) throws IOException {
+ PregelixJob job = new PregelixJob(jobName);
+ job.setVertexClass(NaiveAlgorithmForPathMergeVertex.class);
+ job.setVertexInputFormatClass(NaiveAlgorithmForPathMergeInputFormat.class);
+ job.setVertexOutputFormatClass(NaiveAlgorithmForPathMergeOutputFormat.class);
+ job.setDynamicVertexValueSize(true);
+ job.setOutputKeyClass(KmerBytesWritable.class);
+ job.setOutputValueClass(ValueStateWritable.class);
+ FileInputFormat.setInputPaths(job, HDFS_INPUTPATH);
+ FileOutputFormat.setOutputPath(job, new Path(HDFS_OUTPUTPATH));
+ job.getConfiguration().setInt(NaiveAlgorithmForPathMergeVertex.KMER_SIZE, 5);
+ job.getConfiguration().writeXml(new FileOutputStream(new File(outputPath)));
+ }
+
+ private static void genMergeGraph() throws IOException {
+ generateMergeGraphJob("MergeGraph", outputBase + "MergeGraph.xml");
+ }
+
+ private static void generateLogAlgorithmForMergeGraphJob(String jobName, String outputPath) throws IOException {
+ PregelixJob job = new PregelixJob(jobName);
+ job.setVertexClass(LogAlgorithmForPathMergeVertex.class);
+ job.setVertexInputFormatClass(LogAlgorithmForPathMergeInputFormat.class);
+ job.setVertexOutputFormatClass(LogAlgorithmForPathMergeOutputFormat.class);
+ job.setDynamicVertexValueSize(true);
+ job.setOutputKeyClass(KmerBytesWritable.class);
+ job.setOutputValueClass(ValueStateWritable.class);
+ FileInputFormat.setInputPaths(job, HDFS_INPUTPATH);
+ FileOutputFormat.setOutputPath(job, new Path(HDFS_OUTPUTPATH));
+ job.getConfiguration().setInt(LogAlgorithmForPathMergeVertex.KMER_SIZE, 5);
+ job.getConfiguration().writeXml(new FileOutputStream(new File(outputPath)));
+ }
+
+ private static void genLogAlgorithmForMergeGraph() throws IOException {
+ generateLogAlgorithmForMergeGraphJob("LogAlgorithmForMergeGraph", outputBase + "LogAlgorithmForMergeGraph.xml");
+ }
+
+ /**
+ * @param args
+ * @throws IOException
+ */
+ public static void main(String[] args) throws IOException {
+ //genLoadGraph();
+ //genMergeGraph();
+ genLogAlgorithmForMergeGraph();
+ }
+
+}
diff --git a/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/RunJobTestCase.java b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/RunJobTestCase.java
new file mode 100644
index 0000000..a5ddce3
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/RunJobTestCase.java
@@ -0,0 +1,168 @@
+package edu.uci.ics.genomix.pregelix.JobRun;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.junit.Test;
+
+import edu.uci.ics.genomix.pregelix.example.util.TestUtils;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.pregelix.api.job.PregelixJob;
+import edu.uci.ics.pregelix.core.jobgen.JobGen;
+import edu.uci.ics.pregelix.core.jobgen.JobGenInnerJoin;
+import edu.uci.ics.pregelix.core.jobgen.JobGenOuterJoin;
+import edu.uci.ics.pregelix.core.jobgen.JobGenOuterJoinSingleSort;
+import edu.uci.ics.pregelix.core.jobgen.JobGenOuterJoinSort;
+import edu.uci.ics.pregelix.core.util.PregelixHyracksIntegrationUtil;
+import edu.uci.ics.pregelix.dataflow.util.IterationUtils;
+
+public class RunJobTestCase extends TestCase{
+
+ private static final String NC1 = "nc1";
+ private static final String HYRACKS_APP_NAME = "pregelix";
+ private static String HDFS_INPUTPATH = "/webmap";
+ private static String HDFS_OUTPUTPAH = "/result";
+
+ private final PregelixJob job;
+ private JobGen[] giraphJobGens;
+ private final String resultFileName;
+ private final String expectedFileName;
+ private final String jobFile;
+
+
+
+ public RunJobTestCase(String hadoopConfPath, String jobName, String jobFile, String resultFile, String expectedFile)
+ throws Exception {
+ super("test");
+ this.jobFile = jobFile;
+ this.job = new PregelixJob("test");
+ this.job.getConfiguration().addResource(new Path(jobFile));
+ this.job.getConfiguration().addResource(new Path(hadoopConfPath));
+ Path[] inputPaths = FileInputFormat.getInputPaths(job);
+ if (inputPaths[0].toString().endsWith(HDFS_INPUTPATH)) {
+ FileInputFormat.setInputPaths(job, HDFS_INPUTPATH);
+ FileOutputFormat.setOutputPath(job, new Path(HDFS_OUTPUTPATH));
+ }
+
+ job.setJobName(jobName);
+ this.resultFileName = resultFile;
+ this.expectedFileName = expectedFile;
+ giraphJobGens = new JobGen[1];
+ giraphJobGens[0] = new JobGenOuterJoin(job);
+ /*waitawhile();
+ giraphJobGens[1] = new JobGenInnerJoin(job);
+ waitawhile();
+ giraphJobGens[2] = new JobGenOuterJoinSort(job);
+ waitawhile();
+ giraphJobGens[3] = new JobGenOuterJoinSingleSort(job);*/
+ }
+
+ private void waitawhile() throws InterruptedException {
+ synchronized (this) {
+ this.wait(20);
+ }
+ }
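+ /**
+ * Test lifecycle (as implemented below): create storage, bulk-load the
+ * graph, run loop-body iterations until the termination state is read,
+ * scan and print the index, write the result to HDFS, clean up, and
+ * compare against the expected file.
+ */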
+ @Test
+ public void test() throws Exception {
+ setUp();
+
+ for (JobGen jobGen : giraphJobGens) {
+ FileSystem dfs = FileSystem.get(job.getConfiguration());
+ dfs.delete(new Path(HDFS_OUTPUTPATH), true);
+ runCreate(jobGen);
+ runDataLoad(jobGen);
+ int i = 1;
+ boolean terminate = false;
+ do {
+ runLoopBodyIteration(jobGen, i);
+ terminate = IterationUtils.readTerminationState(job.getConfiguration(), jobGen.getJobId());
+ i++;
+ } while (!terminate);
+ runIndexScan(jobGen);
+ runHDFSWrite(jobGen);
+ runCleanup(jobGen);
+ compareResults();
+ }
+ tearDown();
+ waitawhile();
+ }
+
+ private void runCreate(JobGen jobGen) throws Exception {
+ JobSpecification treeCreateJobSpec = jobGen.generateCreatingJob();
+ PregelixHyracksIntegrationUtil.runJob(treeCreateJobSpec, HYRACKS_APP_NAME);
+ }
+
+ private void runDataLoad(JobGen jobGen) throws Exception {
+ JobSpecification bulkLoadJobSpec = jobGen.generateLoadingJob();
+ PregelixHyracksIntegrationUtil.runJob(bulkLoadJobSpec, HYRACKS_APP_NAME);
+ }
+
+ private void runLoopBodyIteration(JobGen jobGen, int iteration) throws Exception {
+ JobSpecification loopBody = jobGen.generateJob(iteration);
+ PregelixHyracksIntegrationUtil.runJob(loopBody, HYRACKS_APP_NAME);
+ }
+
+ private void runIndexScan(JobGen jobGen) throws Exception {
+ JobSpecification scanSortPrintJobSpec = jobGen.scanIndexPrintGraph(NC1, resultFileName);
+ PregelixHyracksIntegrationUtil.runJob(scanSortPrintJobSpec, HYRACKS_APP_NAME);
+ }
+
+ private void runHDFSWrite(JobGen jobGen) throws Exception {
+ JobSpecification scanIndexWriteJobSpec = jobGen.scanIndexWriteGraph();
+ PregelixHyracksIntegrationUtil.runJob(scanIndexWriteJobSpec, HYRACKS_APP_NAME);
+ }
+
+ private void runCleanup(JobGen jobGen) throws Exception {
+ JobSpecification[] cleanups = jobGen.generateCleanup();
+ runJobArray(cleanups);
+ }
+
+ private void runJobArray(JobSpecification[] jobs) throws Exception {
+ for (JobSpecification job : jobs) {
+ PregelixHyracksIntegrationUtil.runJob(job, HYRACKS_APP_NAME);
+ }
+ }
+
+ private void compareResults() throws Exception {
+ TestUtils.compareWithResult(new File(resultFileName), new File(expectedFileName));
+ }
+
+ @Override
+ public String toString() {
+ return jobFile;
+ }
+
+}
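Note: the constructor above wires up only the outer-join job generator and keeps the three
alternative plans in the commented-out block. For reference, a sketch of the fully enabled
configuration, built directly from that commented code (untested here):

    // assumes the alternative generators accept the same PregelixJob
    giraphJobGens = new JobGen[4];
    giraphJobGens[0] = new JobGenOuterJoin(job);
    giraphJobGens[1] = new JobGenInnerJoin(job);
    giraphJobGens[2] = new JobGenOuterJoinSort(job);
    giraphJobGens[3] = new JobGenOuterJoinSingleSort(job);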
diff --git a/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/RunJobTestSuite.java b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/RunJobTestSuite.java
new file mode 100644
index 0000000..fd0749a
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/JobRun/RunJobTestSuite.java
@@ -0,0 +1,194 @@
+package edu.uci.ics.genomix.pregelix.JobRun;
+
+import java.io.BufferedReader;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import junit.framework.Test;
+import junit.framework.TestResult;
+import junit.framework.TestSuite;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.mapred.JobConf;
+
+import edu.uci.ics.pregelix.core.jobgen.clusterconfig.ClusterConfig;
+import edu.uci.ics.pregelix.core.util.PregelixHyracksIntegrationUtil;
+
+public class RunJobTestSuite extends TestSuite {
+
+ private static final Logger LOGGER = Logger.getLogger(RunJobTestSuite.class
+ .getName());
+
+ private static final String ACTUAL_RESULT_DIR = "actual";
+ private static final String EXPECTED_RESULT_DIR = "src/test/resources/expected";
+ private static final String PATH_TO_HADOOP_CONF = "src/test/resources/hadoop/conf";
+ private static final String PATH_TO_CLUSTER_STORE = "src/test/resources/cluster/stores.properties";
+ private static final String PATH_TO_CLUSTER_PROPERTIES = "src/test/resources/cluster/cluster.properties";
+ private static final String PATH_TO_JOBS = "src/test/resources/jobs/";
+ private static final String PATH_TO_IGNORE = "src/test/resources/ignore.txt";
+ private static final String PATH_TO_ONLY = "src/test/resources/only.txt";
+ private static final String FILE_EXTENSION_OF_RESULTS = "result";
+
+ private static final String DATA_PATH = "data/sequencefile/LongPath";
+ private static final String HDFS_PATH = "/webmap/";
+
+ private static final String HYRACKS_APP_NAME = "pregelix";
+ private static final String HADOOP_CONF_PATH = ACTUAL_RESULT_DIR
+ + File.separator + "conf.xml";
+ private MiniDFSCluster dfsCluster;
+
+ private JobConf conf = new JobConf();
+ private int numberOfNC = 2;
+
+ public void setUp() throws Exception {
+ ClusterConfig.setStorePath(PATH_TO_CLUSTER_STORE);
+ ClusterConfig.setClusterPropertiesPath(PATH_TO_CLUSTER_PROPERTIES);
+ cleanupStores();
+ PregelixHyracksIntegrationUtil.init("src/test/resources/topology.xml");
+ LOGGER.info("Hyracks mini-cluster started");
+ FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
+ FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
+ startHDFS();
+ }
+
+ private void cleanupStores() throws IOException {
+ FileUtils.forceMkdir(new File("teststore"));
+ FileUtils.forceMkdir(new File("build"));
+ FileUtils.cleanDirectory(new File("teststore"));
+ FileUtils.cleanDirectory(new File("build"));
+ }
+
+ private void startHDFS() throws IOException {
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
+ conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
+ FileSystem lfs = FileSystem.getLocal(new Configuration());
+ lfs.delete(new Path("build"), true);
+ System.setProperty("hadoop.log.dir", "logs");
+ dfsCluster = new MiniDFSCluster(conf, numberOfNC, true, null);
+ FileSystem dfs = FileSystem.get(conf);
+ Path src = new Path(DATA_PATH);
+ Path dest = new Path(HDFS_PATH);
+ dfs.mkdirs(dest);
+ dfs.copyFromLocalFile(src, dest);
+
+ DataOutputStream confOutput = new DataOutputStream(
+ new FileOutputStream(new File(HADOOP_CONF_PATH)));
+ conf.writeXml(confOutput);
+ confOutput.flush();
+ confOutput.close();
+ }
+
+ /**
+ * cleanup hdfs cluster
+ */
+ private void cleanupHDFS() throws Exception {
+ dfsCluster.shutdown();
+ }
+
+ public void tearDown() throws Exception {
+ PregelixHyracksIntegrationUtil.deinit();
+ LOGGER.info("Hyracks mini-cluster shut down");
+ cleanupHDFS();
+ }
+
+ public static Test suite() throws Exception {
+ List<String> ignores = getFileList(PATH_TO_IGNORE);
+ List<String> onlys = getFileList(PATH_TO_ONLY);
+ File testData = new File(PATH_TO_JOBS);
+ File[] queries = testData.listFiles();
+ RunJobTestSuite testSuite = new RunJobTestSuite();
+ testSuite.setUp();
+ boolean onlyEnabled = !onlys.isEmpty();
+
+ for (File qFile : queries) {
+ if (isInList(ignores, qFile.getName()))
+ continue;
+
+ if (qFile.isFile()) {
+ if (onlyEnabled && !isInList(onlys, qFile.getName())) {
+ continue;
+ } else {
+ String resultFileName = ACTUAL_RESULT_DIR + File.separator
+ + jobExtToResExt(qFile.getName());
+ String expectedFileName = EXPECTED_RESULT_DIR
+ + File.separator + jobExtToResExt(qFile.getName());
+ testSuite.addTest(new RunJobTestCase(HADOOP_CONF_PATH,
+ qFile.getName(),
+ qFile.getAbsolutePath().toString(), resultFileName,
+ expectedFileName));
+ }
+ }
+ }
+ return testSuite;
+ }
+
+ /**
+ * Runs the tests and collects their result in a TestResult.
+ */
+ @Override
+ public void run(TestResult result) {
+ try {
+ int testCount = countTestCases();
+ for (int i = 0; i < testCount; i++) {
+ // cleanupStores();
+ Test each = this.testAt(i);
+ if (result.shouldStop())
+ break;
+ runTest(each, result);
+ }
+
+ tearDown();
+ } catch (Exception e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ protected static List<String> getFileList(String filePath) throws FileNotFoundException, IOException {
+ BufferedReader reader = new BufferedReader(new FileReader(filePath));
+ String line = null;
+ List<String> fileNames = new ArrayList<String>();
+ while ((line = reader.readLine()) != null) {
+ fileNames.add(line);
+ }
+ reader.close();
+ return fileNames;
+ }
+
+ private static String jobExtToResExt(String fname) {
+ int dot = fname.lastIndexOf('.');
+ return fname.substring(0, dot + 1) + FILE_EXTENSION_OF_RESULTS;
+ }
+
+ private static boolean isInList(List<String> onlys, String name) {
+ for (String only : onlys)
+ if (name.contains(only))
+ return true;
+ return false;
+ }
+
+ public JobConf getConf() {
+ return conf;
+ }
+
+ public void setConf(JobConf conf) {
+ this.conf = conf;
+ }
+
+}
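Note: suite() selects job files from src/test/resources/jobs/ by filtering them through
ignore.txt and only.txt with substring matching (isInList). A minimal illustration of the
selection semantics, using hypothetical file names:

    // with only.txt containing the single line "LongPath":
    List<String> onlys = java.util.Arrays.asList("LongPath");
    isInList(onlys, "LongPath.xml");   // true  -> a RunJobTestCase is added
    isInList(onlys, "TreePath.xml");   // false -> this job file is skipped
    // an empty only.txt leaves onlyEnabled false, so every job file
    // not listed in ignore.txt is run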
diff --git a/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/example/util/TestUtils.java b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/example/util/TestUtils.java
new file mode 100644
index 0000000..8ac1b09
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/java/edu/uci/ics/genomix/pregelix/example/util/TestUtils.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.genomix.pregelix.example.util;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+
+public class TestUtils {
+
+ public static void compareWithResultDir(File expectedFileDir, File actualFileDir) throws Exception {
+ String[] fileNames = expectedFileDir.list();
+ for (String fileName : fileNames) {
+ compareWithResult(new File(expectedFileDir, fileName), new File(actualFileDir, fileName));
+ }
+ }
+
+ public static void compareWithResult(File expectedFile, File actualFile) throws Exception {
+ BufferedReader readerExpected = new BufferedReader(new FileReader(expectedFile));
+ BufferedReader readerActual = new BufferedReader(new FileReader(actualFile));
+ String lineExpected, lineActual;
+ int num = 1;
+ try {
+ while ((lineExpected = readerExpected.readLine()) != null) {
+ lineActual = readerActual.readLine();
+ if (lineActual == null) {
+ throw new Exception("Actual result changed at line " + num + ":\n< " + lineExpected + "\n> ");
+ }
+ if (!equalStrings(lineExpected, lineActual)) {
+ throw new Exception("Result for changed at line " + num + ":\n< " + lineExpected + "\n> "
+ + lineActual);
+ }
+ ++num;
+ }
+ lineActual = readerActual.readLine();
+ if (lineActual != null) {
+ throw new Exception("Actual result changed at line " + num + ":\n< \n> " + lineActual);
+ }
+ } finally {
+ readerExpected.close();
+ readerActual.close();
+ }
+ }
+
+ private static boolean equalStrings(String s1, String s2) {
+ String[] rowsOne = s1.split("\n");
+ String[] rowsTwo = s2.split("\n");
+
+ if (rowsOne.length != rowsTwo.length)
+ return false;
+
+ for (int i = 0; i < rowsOne.length; i++) {
+ String row1 = rowsOne[i];
+ String row2 = rowsTwo[i];
+
+ if (row1.equals(row2))
+ continue;
+
+ boolean spaceOrTab = row1.contains(" ");
+ String[] fields1 = spaceOrTab ? row1.split(" ") : row1.split("\t");
+ String[] fields2 = spaceOrTab ? row2.split(" ") : row2.split("\t");
+
+ if (fields1.length != fields2.length)
+ return false;
+ for (int j = 0; j < fields1.length; j++) {
+ if (fields1[j].equals(fields2[j])) {
+ continue;
+ } else if (fields1[j].indexOf('.') < 0) {
+ return false;
+ } else {
+ float float1 = (float) Double.parseDouble(fields1[j]);
+ float float2 = (float) Double.parseDouble(fields2[j]);
+
+ if (Math.abs(float1 - float2) < 1.0e-7)
+ continue;
+ else {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+}
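Note: compareWithResult treats any field containing a '.' as numeric and compares it after
narrowing to float with a 1.0e-7 tolerance. A small illustration of two lines that would
compare equal under this rule (hypothetical data):

    String expected = "GCCTC\t0.3333333";
    String actual = "GCCTC\t0.33333334";
    // both rows split on '\t'; the second fields parse to doubles whose
    // float values differ by about 4e-8 < 1.0e-7, so the rows match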
diff --git a/genomix/genomix-pregelix/src/test/resources/cluster/cluster.properties b/genomix/genomix-pregelix/src/test/resources/cluster/cluster.properties
new file mode 100644
index 0000000..14f8bd4
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/cluster/cluster.properties
@@ -0,0 +1,37 @@
+#The CC port for Hyracks clients
+CC_CLIENTPORT=3099
+
+#The CC port for Hyracks cluster management
+CC_CLUSTERPORT=1099
+
+#The directory of hyracks binaries
+HYRACKS_HOME=../../../../hyracks
+
+#The tmp directory for cc to install jars
+CCTMP_DIR=/tmp/t1
+
+#The tmp directory for nc to install jars
+NCTMP_DIR=/tmp/t2
+
+#The directory to put cc logs
+CCLOGS_DIR=$CCTMP_DIR/logs
+
+#The directory to put nc logs
+NCLOGS_DIR=$NCTMP_DIR/logs
+
+#Comma separated I/O directories for the spilling of external sort
+IO_DIRS="/tmp/t3,/tmp/t4"
+
+#The JAVA_HOME
+JAVA_HOME=$JAVA_HOME
+
+#The frame size of the internal dataflow engine
+FRAME_SIZE=65536
+
+#CC JAVA_OPTS
+CCJAVA_OPTS="-Xdebug -Xrunjdwp:transport=dt_socket,address=7001,server=y,suspend=n -Xmx3g -Djava.util.logging.config.file=logging.properties"
+# Yourkit option: -agentpath:/grid/0/dev/vborkar/tools/yjp-10.0.4/bin/linux-x86-64/libyjpagent.so=port=20001
+
+#NC JAVA_OPTS
+NCJAVA_OPTS="-Xdebug -Xrunjdwp:transport=dt_socket,address=7002,server=y,suspend=n -Xmx1g -Djava.util.logging.config.file=logging.properties"
+
diff --git a/genomix/genomix-pregelix/src/test/resources/cluster/stores.properties b/genomix/genomix-pregelix/src/test/resources/cluster/stores.properties
new file mode 100644
index 0000000..daf881e
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/cluster/stores.properties
@@ -0,0 +1 @@
+store=teststore
\ No newline at end of file
diff --git a/genomix/genomix-pregelix/src/test/resources/hadoop/conf/core-site.xml b/genomix/genomix-pregelix/src/test/resources/hadoop/conf/core-site.xml
new file mode 100644
index 0000000..47dfac5
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/hadoop/conf/core-site.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+<property>
+ <name>fs.default.name</name>
+ <value>hdfs://127.0.0.1:31888</value>
+</property>
+<property>
+ <name>hadoop.tmp.dir</name>
+ <value>/tmp/hadoop</value>
+</property>
+
+</configuration>
diff --git a/genomix/genomix-pregelix/src/test/resources/hadoop/conf/hdfs-site.xml b/genomix/genomix-pregelix/src/test/resources/hadoop/conf/hdfs-site.xml
new file mode 100644
index 0000000..a36b6d7
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/hadoop/conf/hdfs-site.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+<property>
+ <name>dfs.replication</name>
+ <value>1</value>
+</property>
+
+<property>
+ <name>dfs.block.size</name>
+ <value>655360</value>
+</property>
+
+</configuration>
diff --git a/genomix/genomix-pregelix/src/test/resources/hadoop/conf/log4j.properties b/genomix/genomix-pregelix/src/test/resources/hadoop/conf/log4j.properties
new file mode 100755
index 0000000..d5e6004
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/hadoop/conf/log4j.properties
@@ -0,0 +1,94 @@
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=FATAL,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+# Define the root logger to the system property "hadoop.root.logger".
+log4j.rootLogger=${hadoop.root.logger}, EventCounter
+
+# Logging Threshold
+log4j.threshhold=FATAL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+
+#Default values
+hadoop.tasklog.taskid=null
+hadoop.tasklog.noKeepSplits=4
+hadoop.tasklog.totalLogFileSize=100
+hadoop.tasklog.purgeLogSplits=true
+hadoop.tasklog.logsRetainHours=12
+
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
+log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# Rolling File Appender
+#
+
+#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
+#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Logfile size and 30-day backups
+#log4j.appender.RFA.MaxFileSize=1MB
+#log4j.appender.RFA.MaxBackupIndex=30
+
+#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+#
+# FSNamesystem Audit logging
+# All audit events are logged at INFO level
+#
+log4j.logger.org.apache.hadoop.fs.FSNamesystem.audit=WARN
+
+# Custom Logging levels
+
+#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
+#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
+#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG
+
+# Jets3t library
+log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
diff --git a/genomix/genomix-pregelix/src/test/resources/hadoop/conf/mapred-site.xml b/genomix/genomix-pregelix/src/test/resources/hadoop/conf/mapred-site.xml
new file mode 100644
index 0000000..f75b072
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/hadoop/conf/mapred-site.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+ <property>
+ <name>mapred.job.tracker</name>
+ <value>localhost:29007</value>
+ </property>
+ <property>
+ <name>mapred.tasktracker.map.tasks.maximum</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>mapred.tasktracker.reduce.tasks.maximum</name>
+ <value>20</value>
+ </property>
+ <property>
+ <name>mapred.max.split.size</name>
+ <value>128000</value>
+ </property>
+
+</configuration>
diff --git a/genomix/genomix-pregelix/src/test/resources/hyracks-deployment.properties b/genomix/genomix-pregelix/src/test/resources/hyracks-deployment.properties
new file mode 100644
index 0000000..9c42b89
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/hyracks-deployment.properties
@@ -0,0 +1,2 @@
+#cc.bootstrap.class=edu.uci.ics.asterix.hyracks.bootstrap.CCBootstrapImpl
+nc.bootstrap.class=edu.uci.ics.pregelix.runtime.bootstrap.NCBootstrapImpl
diff --git a/genomix/genomix-pregelix/src/test/resources/ignore.txt b/genomix/genomix-pregelix/src/test/resources/ignore.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/ignore.txt
diff --git a/genomix/genomix-pregelix/src/test/resources/log4j.properties b/genomix/genomix-pregelix/src/test/resources/log4j.properties
new file mode 100755
index 0000000..d5e6004
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/log4j.properties
@@ -0,0 +1,94 @@
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=FATAL,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+# Define the root logger to the system property "hadoop.root.logger".
+log4j.rootLogger=${hadoop.root.logger}, EventCounter
+
+# Logging Threshold
+log4j.threshhold=FATAL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+
+#Default values
+hadoop.tasklog.taskid=null
+hadoop.tasklog.noKeepSplits=4
+hadoop.tasklog.totalLogFileSize=100
+hadoop.tasklog.purgeLogSplits=true
+hadoop.tasklog.logsRetainHours=12
+
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
+log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# Rolling File Appender
+#
+
+#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
+#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Logfile size and 30-day backups
+#log4j.appender.RFA.MaxFileSize=1MB
+#log4j.appender.RFA.MaxBackupIndex=30
+
+#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+#
+# FSNamesystem Audit logging
+# All audit events are logged at INFO level
+#
+log4j.logger.org.apache.hadoop.fs.FSNamesystem.audit=WARN
+
+# Custom Logging levels
+
+#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
+#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
+#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG
+
+# Jets3t library
+log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
diff --git a/genomix/genomix-pregelix/src/test/resources/logging.properties b/genomix/genomix-pregelix/src/test/resources/logging.properties
new file mode 100644
index 0000000..b8f2be9
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/logging.properties
@@ -0,0 +1,66 @@
+############################################################
+# Default Logging Configuration File
+#
+# You can use a different file by specifying a filename
+# with the java.util.logging.config.file system property.
+# For example java -Djava.util.logging.config.file=myfile
+############################################################
+
+############################################################
+# Global properties
+############################################################
+
+# "handlers" specifies a comma separated list of log Handler
+# classes. These handlers will be installed during VM startup.
+# Note that these classes must be on the system classpath.
+# By default we only configure a ConsoleHandler, which will only
+# show messages at the INFO and above levels.
+
+handlers= java.util.logging.ConsoleHandler
+
+# To also add the FileHandler, use the following line instead.
+
+# handlers= java.util.logging.FileHandler, java.util.logging.ConsoleHandler
+
+# Default global logging level.
+# This specifies which kinds of events are logged across
+# all loggers. For any given facility this global level
+# can be overridden by a facility-specific level.
+# Note that the ConsoleHandler also has a separate level
+# setting to limit messages printed to the console.
+
+.level= SEVERE
+# .level= INFO
+# .level= FINE
+# .level = FINEST
+
+############################################################
+# Handler specific properties.
+# Describes specific configuration info for Handlers.
+############################################################
+
+# default file output is in user's home directory.
+
+# java.util.logging.FileHandler.pattern = %h/java%u.log
+# java.util.logging.FileHandler.limit = 50000
+# java.util.logging.FileHandler.count = 1
+# java.util.logging.FileHandler.formatter = java.util.logging.XMLFormatter
+
+# Limit the messages that are printed on the console to FINEST and above.
+
+java.util.logging.ConsoleHandler.level = FINEST
+java.util.logging.ConsoleHandler.formatter = java.util.logging.SimpleFormatter
+
+
+############################################################
+# Facility specific properties.
+# Provides extra control for each logger.
+############################################################
+
+# For example, set the com.xyz.foo logger to only log SEVERE
+# messages:
+
+#edu.uci.ics.asterix.level = FINE
+#edu.uci.ics.algebricks.level = FINE
+edu.uci.ics.hyracks.level = SEVERE
+#edu.uci.ics.hyracks.control.nc.net.level = FINE
\ No newline at end of file
diff --git a/genomix/genomix-pregelix/src/test/resources/only.txt b/genomix/genomix-pregelix/src/test/resources/only.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/only.txt
diff --git a/genomix/genomix-pregelix/src/test/resources/topology.xml b/genomix/genomix-pregelix/src/test/resources/topology.xml
new file mode 100755
index 0000000..2a6c380
--- /dev/null
+++ b/genomix/genomix-pregelix/src/test/resources/topology.xml
@@ -0,0 +1,7 @@
+<cluster-topology>
+ <network-switch name="Global">
+ <network-switch name="local">
+ <terminal name="127.1.0.1"/>
+ </network-switch>
+ </network-switch>
+</cluster-topology>
\ No newline at end of file
diff --git a/genomix/genomix-pregelix/text/BridgePath/log_BridgePath b/genomix/genomix-pregelix/text/BridgePath/log_BridgePath
new file mode 100644
index 0000000..cab1281
--- /dev/null
+++ b/genomix/genomix-pregelix/text/BridgePath/log_BridgePath
@@ -0,0 +1,6 @@
+TTCCA T|C
+CCGTG CT|
+TCCAC T|CT
+CCACC T|G 9 CCACCCCGT 5
+TTTCC |A
+CCACT T|G 9 CCACTCCGT 5
diff --git a/genomix/genomix-pregelix/text/BridgePath/naive_BridgePath b/genomix/genomix-pregelix/text/BridgePath/naive_BridgePath
new file mode 100644
index 0000000..c669dba
--- /dev/null
+++ b/genomix/genomix-pregelix/text/BridgePath/naive_BridgePath
@@ -0,0 +1,6 @@
+TTCCA T|C 5 TTCCA 1
+CCGTG CT|
+TCCAC T|CT
+CCACC T|G 9 CCACCCCGT 1
+TTTCC |A
+CCACT T|C
diff --git a/genomix/genomix-pregelix/text/CyclePath/log_CyclePath b/genomix/genomix-pregelix/text/CyclePath/log_CyclePath
new file mode 100644
index 0000000..d0ee84f
--- /dev/null
+++ b/genomix/genomix-pregelix/text/CyclePath/log_CyclePath
@@ -0,0 +1,3 @@
+GCAAC |T
+AACTT C|T 12 AACTTCATCAAC 5
+CAACT GT|T
diff --git a/genomix/genomix-pregelix/text/CyclePath/naive_CyclePath b/genomix/genomix-pregelix/text/CyclePath/naive_CyclePath
new file mode 100644
index 0000000..5cdb7c2
--- /dev/null
+++ b/genomix/genomix-pregelix/text/CyclePath/naive_CyclePath
@@ -0,0 +1,3 @@
+GCAAC |T
+AACTT C|T 12 AACTTCATCAAC 1
+CAACT GT|T
diff --git a/genomix/genomix-pregelix/text/LongPath/log_LongPath b/genomix/genomix-pregelix/text/LongPath/log_LongPath
new file mode 100644
index 0000000..98cb21e
--- /dev/null
+++ b/genomix/genomix-pregelix/text/LongPath/log_LongPath
@@ -0,0 +1,3 @@
+GCCTC G|G 15 GCCTCAGTACGCCCG 5
+CCCGG G|
+GGCCT |C
diff --git a/genomix/genomix-pregelix/text/LongPath/naive_LongPath b/genomix/genomix-pregelix/text/LongPath/naive_LongPath
new file mode 100644
index 0000000..c1a472e
--- /dev/null
+++ b/genomix/genomix-pregelix/text/LongPath/naive_LongPath
@@ -0,0 +1,3 @@
+GCCTC G|G 15 GCCTCAGTACGCCCG 1
+CCCGG G|
+GGCCT |C
diff --git a/genomix/genomix-pregelix/text/Path/log_Path b/genomix/genomix-pregelix/text/Path/log_Path
new file mode 100644
index 0000000..3a46528
--- /dev/null
+++ b/genomix/genomix-pregelix/text/Path/log_Path
@@ -0,0 +1,3 @@
+GCCTC G|G 10 GCCTCAGTAC 5
+GGCCT |C
+GTACG A|
diff --git a/genomix/genomix-pregelix/text/Path/naive_Path b/genomix/genomix-pregelix/text/Path/naive_Path
new file mode 100644
index 0000000..b8d2aeb
--- /dev/null
+++ b/genomix/genomix-pregelix/text/Path/naive_Path
@@ -0,0 +1,3 @@
+GCCTC G|G 10 GCCTCAGTAC 1
+GGCCT |C
+GTACG A|
diff --git a/genomix/genomix-pregelix/text/SimplePath/log_SimplePath b/genomix/genomix-pregelix/text/SimplePath/log_SimplePath
new file mode 100644
index 0000000..5c149ac
--- /dev/null
+++ b/genomix/genomix-pregelix/text/SimplePath/log_SimplePath
@@ -0,0 +1,9 @@
+CGGCA G|A 8 CGGCAAGA 5
+AGCAC C|
+AAGAC |A
+GCGGC |A
+GCATC C|
+ATATC |G
+TATCG A|C 8 TATCGCAT 5
+AAGAA C|
+AGACA A|C 8 AGACAGCA 5
diff --git a/genomix/genomix-pregelix/text/SimplePath/naive_SimplePath b/genomix/genomix-pregelix/text/SimplePath/naive_SimplePath
new file mode 100644
index 0000000..cf53cc8
--- /dev/null
+++ b/genomix/genomix-pregelix/text/SimplePath/naive_SimplePath
@@ -0,0 +1,9 @@
+CGGCA G|A 8 CGGCAAGA 1
+AGCAC C|
+AAGAC |A
+GCGGC |A
+GCATC C|
+ATATC |G
+TATCG A|C 8 TATCGCAT 1
+AAGAA C|
+AGACA A|C 8 AGACAGCA 1
diff --git a/genomix/genomix-pregelix/text/SinglePath/log_SinglePath b/genomix/genomix-pregelix/text/SinglePath/log_SinglePath
new file mode 100644
index 0000000..f1371ec
--- /dev/null
+++ b/genomix/genomix-pregelix/text/SinglePath/log_SinglePath
@@ -0,0 +1,3 @@
+ACAGT A|
+GACAA A|T 8 GACAACAG 5
+AGACA |A
diff --git a/genomix/genomix-pregelix/text/SinglePath/naive_SinglePath b/genomix/genomix-pregelix/text/SinglePath/naive_SinglePath
new file mode 100644
index 0000000..b736667
--- /dev/null
+++ b/genomix/genomix-pregelix/text/SinglePath/naive_SinglePath
@@ -0,0 +1,3 @@
+ACAGT A|
+GACAA A|T 8 GACAACAG 1
+AGACA |A
diff --git a/genomix/genomix-pregelix/text/TreePath/log_TreePath b/genomix/genomix-pregelix/text/TreePath/log_TreePath
new file mode 100644
index 0000000..0b9f198
--- /dev/null
+++ b/genomix/genomix-pregelix/text/TreePath/log_TreePath
@@ -0,0 +1,9 @@
+CAGTA T|AC
+AGTAC C|G 10 AGTACGCCCG 5
+ATCCC T|
+GCCTC G|A 8 GCCTCAGT 5
+CCCGG G|
+GGCCT |CG
+AGTAA C|C 10 AGTAACTAAA 5
+TAAAC C|
+GCCTG G|C 12 GCCTGGCTATCC 5
diff --git a/genomix/genomix-pregelix/text/TreePath/naive_TreePath b/genomix/genomix-pregelix/text/TreePath/naive_TreePath
new file mode 100644
index 0000000..39dcbaa
--- /dev/null
+++ b/genomix/genomix-pregelix/text/TreePath/naive_TreePath
@@ -0,0 +1,9 @@
+CAGTA T|AC
+AGTAC C|G 10 AGTACGCCCG 1
+ATCCC T|
+GCCTC G|A 8 GCCTCAGT 1
+CCCGG G|
+GGCCT |CG
+AGTAA C|C 10 AGTAACTAAA 1
+TAAAC C|
+GCCTG G|C 12 GCCTGGCTATCC 1
diff --git a/genomix/pom.xml b/genomix/pom.xml
new file mode 100644
index 0000000..7876c8e
--- /dev/null
+++ b/genomix/pom.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>genomix</artifactId>
+ <version>0.2.6-SNAPSHOT</version>
+ <packaging>pom</packaging>
+ <name>genomix</name>
+
+ <distributionManagement>
+ <repository>
+ <id>hyracks-releases</id>
+ <url>http://obelix.ics.uci.edu/nexus/content/repositories/hyracks-releases/</url>
+ </repository>
+ <snapshotRepository>
+ <id>hyracks-snapshots</id>
+ <url>http://obelix.ics.uci.edu/nexus/content/repositories/hyracks-snapshots/</url>
+ </snapshotRepository>
+ </distributionManagement>
+
+ <repositories>
+ <repository>
+ <id>hyracks-public</id>
+ <url>http://obelix.ics.uci.edu/nexus/content/groups/hyracks-public/</url>
+ </repository>
+ <repository>
+ <id>jboss-public</id>
+ <url>https://repository.jboss.org/nexus/content/groups/public/</url>
+ </repository>
+ </repositories>
+
+ <pluginRepositories>
+ <pluginRepository>
+ <id>hyracks-public</id>
+ <url>http://obelix.ics.uci.edu/nexus/content/groups/hyracks-public/</url>
+ <releases>
+ <updatePolicy>always</updatePolicy>
+ </releases>
+ </pluginRepository>
+ </pluginRepositories>
+
+ <modules>
+ <module>genomix-data</module>
+ <module>genomix-hyracks</module>
+ <module>genomix-hadoop</module>
+ <module>genomix-pregelix</module>
+ </modules>
+</project>
diff --git a/hyracks/hyracks-control/hyracks-control-cc/src/main/java/edu/uci/ics/hyracks/control/cc/work/TaskFailureWork.java b/hyracks/hyracks-control/hyracks-control-cc/src/main/java/edu/uci/ics/hyracks/control/cc/work/TaskFailureWork.java
index bc8c314..b3550bf 100644
--- a/hyracks/hyracks-control/hyracks-control-cc/src/main/java/edu/uci/ics/hyracks/control/cc/work/TaskFailureWork.java
+++ b/hyracks/hyracks-control/hyracks-control-cc/src/main/java/edu/uci/ics/hyracks/control/cc/work/TaskFailureWork.java
@@ -32,7 +32,7 @@
@Override
protected void performEvent(TaskAttempt ta) {
JobRun run = ccs.getActiveRunMap().get(jobId);
- ccs.getDatasetDirectoryService().reportJobFailure(jobId);
+ //ccs.getDatasetDirectoryService().reportJobFailure(jobId);
ActivityCluster ac = ta.getTask().getTaskCluster().getActivityCluster();
run.getScheduler().notifyTaskFailure(ta, ac, details);
}
diff --git a/hyracks/hyracks-control/hyracks-control-nc/src/main/java/edu/uci/ics/hyracks/control/nc/partitions/MaterializingPipelinedPartition.java b/hyracks/hyracks-control/hyracks-control-nc/src/main/java/edu/uci/ics/hyracks/control/nc/partitions/MaterializingPipelinedPartition.java
index 62320c5..1cd98fb 100644
--- a/hyracks/hyracks-control/hyracks-control-nc/src/main/java/edu/uci/ics/hyracks/control/nc/partitions/MaterializingPipelinedPartition.java
+++ b/hyracks/hyracks-control/hyracks-control-nc/src/main/java/edu/uci/ics/hyracks/control/nc/partitions/MaterializingPipelinedPartition.java
@@ -139,7 +139,7 @@
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("open(" + pid + " by " + taId);
}
- fRef = manager.getFileFactory().createUnmanagedWorkspaceFile(pid.toString());
+ fRef = manager.getFileFactory().createUnmanagedWorkspaceFile(pid.toString().replace(':', '_'));
handle = ctx.getIOManager().open(fRef, IIOManager.FileReadWriteMode.READ_WRITE,
IIOManager.FileSyncMode.METADATA_ASYNC_DATA_ASYNC);
size = 0;
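Note: the replace(':', '_') above presumably works around PartitionId.toString() producing
colon characters, which are not legal in file names on all platforms (e.g. Windows). A sketch
of the effect, with an assumed rendering of the partition id:

    String pid = "PID:[JID:0:CDID:2:SDID:1]";   // assumed toString() format
    String safe = pid.replace(':', '_');        // "PID_[JID_0_CDID_2_SDID_1]"
    // the unmanaged workspace file is created under the sanitized name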
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/GroupRunMergingFrameReader.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/GroupRunMergingFrameReader.java
new file mode 100644
index 0000000..d63609e
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/GroupRunMergingFrameReader.java
@@ -0,0 +1,377 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameReader;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+
+public class GroupRunMergingFrameReader implements IFrameReader {
+
+ private static final int INT_SIZE = 4;
+
+ private final IHyracksTaskContext ctx;
+ private final IFrameReader[] runCursors;
+ private final List<ByteBuffer> inFrames;
+ private final int[] keyFields;
+ private final int framesLimit;
+ private final int tableSize;
+ private final IBinaryComparator[] comparators;
+ private final RecordDescriptor recordDesc;
+ private final FrameTupleAppender outFrameAppender;
+ private final ITuplePartitionComputer tpc;
+ private ReferencedPriorityQueue topTuples;
+ private int[] tupleIndexes;
+ private int[] currentFrameIndexForRuns, bufferedFramesForRuns;
+ private FrameTupleAccessor[] tupleAccessors;
+ private int framesBuffered;
+
+ private final IAggregatorDescriptor grouper;
+ private final AggregateState groupState;
+
+ private final boolean isLoadBuffered;
+
+ private final boolean isFinalPhase;
+
+ private final ArrayTupleBuilder groupTupleBuilder, outputTupleBuilder;
+
+ private byte[] groupResultCache;
+ private ByteBuffer groupResultCacheBuffer;
+ private IFrameTupleAccessor groupResultCacheAccessor;
+ private FrameTupleAppender groupResultCacheAppender;
+
+ // FIXME
+ long queueCompCounter = 0, mergeCompCounter = 0;
+
+ public GroupRunMergingFrameReader(IHyracksTaskContext ctx, IFrameReader[] runCursors, int framesLimit,
+ int tableSize, int[] keyFields, ITuplePartitionComputer tpc, IBinaryComparator[] comparators,
+ IAggregatorDescriptor grouper, RecordDescriptor recordDesc, boolean isFinalPhase) {
+ this(ctx, runCursors, framesLimit, tableSize, keyFields, tpc, comparators, grouper, recordDesc, isFinalPhase,
+ false);
+ }
+
+ public GroupRunMergingFrameReader(IHyracksTaskContext ctx, IFrameReader[] runCursors, int framesLimit,
+ int tableSize, int[] keyFields, ITuplePartitionComputer tpc, IBinaryComparator[] comparators,
+ IAggregatorDescriptor grouper, RecordDescriptor recordDesc, boolean isFinalPhase, boolean isLoadBuffered) {
+ this.ctx = ctx;
+ this.runCursors = runCursors;
+ this.inFrames = new ArrayList<ByteBuffer>();
+ this.keyFields = keyFields;
+ this.tableSize = tableSize;
+ this.comparators = comparators;
+ this.recordDesc = recordDesc;
+ this.grouper = grouper;
+ this.groupState = grouper.createAggregateStates();
+ this.outFrameAppender = new FrameTupleAppender(ctx.getFrameSize());
+ this.isLoadBuffered = isLoadBuffered;
+ this.isFinalPhase = isFinalPhase;
+ this.framesLimit = framesLimit;
+ this.tpc = tpc;
+
+ this.groupTupleBuilder = new ArrayTupleBuilder(recordDesc.getFieldCount());
+ this.outputTupleBuilder = new ArrayTupleBuilder(recordDesc.getFieldCount());
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameReader#open()
+ */
+ @Override
+ public void open() throws HyracksDataException {
+ if (isLoadBuffered) {
+ while (inFrames.size() + 1 < framesLimit) {
+ inFrames.add(ctx.allocateFrame());
+ }
+ framesBuffered = inFrames.size() / runCursors.length;
+ } else {
+ while (inFrames.size() < framesLimit - 1 && inFrames.size() < runCursors.length) {
+ inFrames.add(ctx.allocateFrame());
+ }
+ framesBuffered = 1;
+ }
+ tupleAccessors = new FrameTupleAccessor[runCursors.length];
+ currentFrameIndexForRuns = new int[runCursors.length];
+ bufferedFramesForRuns = new int[runCursors.length];
+ Comparator<ReferenceEntryWithBucketID> comparator = createEntryComparator(comparators);
+ topTuples = new ReferencedPriorityQueue(ctx.getFrameSize(), recordDesc, runCursors.length, comparator);
+ tupleIndexes = new int[runCursors.length];
+
+ for (int i = 0; i < runCursors.length; i++) {
+ int runIndex = topTuples.peek().getRunid();
+ tupleIndexes[runIndex] = 0;
+ runCursors[runIndex].open();
+ for (int j = 0; j < framesBuffered; j++) {
+
+ if (runCursors[runIndex].nextFrame(inFrames.get(runIndex * framesBuffered + j))) {
+
+ bufferedFramesForRuns[runIndex]++;
+ if (j == 0) {
+ tupleAccessors[runIndex] = new FrameTupleAccessor(ctx.getFrameSize(), recordDesc);
+ tupleAccessors[runIndex].reset(inFrames.get(runIndex * framesBuffered + j));
+ setNextTopTuple(runIndex, tupleIndexes, runCursors, tupleAccessors, topTuples);
+ currentFrameIndexForRuns[runIndex] = runIndex * framesBuffered;
+ }
+ } else {
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameReader#nextFrame(java.nio.ByteBuffer)
+ */
+ @Override
+ public boolean nextFrame(ByteBuffer buffer) throws HyracksDataException {
+ outFrameAppender.reset(buffer, true);
+
+ while (!topTuples.areRunsExhausted()) {
+ ReferenceEntryWithBucketID top = topTuples.peek();
+ int runIndex = top.getRunid();
+ FrameTupleAccessor fta = top.getAccessor();
+ int tupleIndex = top.getTupleIndex();
+
+ // check whether we can do aggregation
+ boolean needInsert = true;
+ if (groupResultCache != null && groupResultCacheAccessor.getTupleCount() > 0) {
+ groupResultCacheAccessor.reset(ByteBuffer.wrap(groupResultCache));
+ if (compareFrameTuples(fta, tupleIndex, groupResultCacheAccessor, 0) == 0) {
+ needInsert = false;
+ }
+ }
+
+ if (needInsert) {
+
+ // try to flush the group cache into the output buffer, if any
+ if (groupResultCacheAccessor != null && groupResultCacheAccessor.getFieldCount() > 0) {
+ outputTupleBuilder.reset();
+ for (int k = 0; k < keyFields.length; k++) {
+ outputTupleBuilder.addField(groupResultCacheAccessor, 0, k);
+ }
+ if (isFinalPhase) {
+ grouper.outputFinalResult(outputTupleBuilder, groupResultCacheAccessor, 0, groupState);
+ } else {
+ grouper.outputPartialResult(outputTupleBuilder, groupResultCacheAccessor, 0, groupState);
+ }
+
+ // return if the buffer is full
+ if (!outFrameAppender.append(outputTupleBuilder.getFieldEndOffsets(),
+ outputTupleBuilder.getByteArray(), 0, outputTupleBuilder.getSize())) {
+ return true;
+ }
+ groupResultCacheBuffer.putInt(groupResultCache.length - 4, 0);
+ }
+
+ groupTupleBuilder.reset();
+ for (int k : keyFields) {
+ groupTupleBuilder.addField(fta, tupleIndex, k);
+ }
+ grouper.init(groupTupleBuilder, fta, tupleIndex, groupState);
+
+ // enlarge the cache buffer if necessary
+ int requiredSize = groupTupleBuilder.getSize() + groupTupleBuilder.getFieldEndOffsets().length
+ * INT_SIZE + 2 * INT_SIZE;
+
+ if (groupResultCache == null || groupResultCache.length < requiredSize) {
+ groupResultCache = new byte[requiredSize];
+ groupResultCacheAppender = new FrameTupleAppender(groupResultCache.length);
+ groupResultCacheBuffer = ByteBuffer.wrap(groupResultCache);
+ groupResultCacheAccessor = new FrameTupleAccessor(groupResultCache.length, recordDesc);
+ }
+
+ // always reset the group cache
+ groupResultCacheAppender.reset(groupResultCacheBuffer, true);
+ if (!groupResultCacheAppender.append(groupTupleBuilder.getFieldEndOffsets(),
+ groupTupleBuilder.getByteArray(), 0, groupTupleBuilder.getSize())) {
+ throw new HyracksDataException("The partial result is too large to be initialized in a frame.");
+ }
+
+ groupResultCacheAccessor.reset(groupResultCacheBuffer);
+
+ } else {
+ grouper.aggregate(fta, tupleIndex, groupResultCacheAccessor, 0, groupState);
+ }
+
+ ++tupleIndexes[runIndex];
+ setNextTopTuple(runIndex, tupleIndexes, runCursors, tupleAccessors, topTuples);
+ }
+
+ if (groupResultCacheAccessor != null && groupResultCacheAccessor.getTupleCount() > 0) {
+ outputTupleBuilder.reset();
+ for (int k = 0; k < keyFields.length; k++) {
+ outputTupleBuilder.addField(groupResultCacheAccessor, 0, k);
+ }
+ if (isFinalPhase) {
+ grouper.outputFinalResult(outputTupleBuilder, groupResultCacheAccessor, 0, groupState);
+ } else {
+ grouper.outputPartialResult(outputTupleBuilder, groupResultCacheAccessor, 0, groupState);
+ }
+
+ // return if the buffer is full
+ if (!outFrameAppender.append(outputTupleBuilder.getFieldEndOffsets(), outputTupleBuilder.getByteArray(), 0,
+ outputTupleBuilder.getSize())) {
+ return true;
+ }
+
+ groupResultCacheAccessor = null;
+ groupResultCache = null;
+ groupResultCacheBuffer = null;
+ groupResultCacheAppender = null;
+ }
+
+ return outFrameAppender.getTupleCount() > 0;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameReader#close()
+ */
+ @Override
+ public void close() throws HyracksDataException {
+ for (int i = 0; i < runCursors.length; ++i) {
+ closeRun(i, runCursors, tupleAccessors);
+ }
+ }
+
+ private void setNextTopTuple(int runIndex, int[] tupleIndexes, IFrameReader[] runCursors,
+ FrameTupleAccessor[] tupleAccessors, ReferencedPriorityQueue topTuples) throws HyracksDataException {
+ boolean exists = hasNextTuple(runIndex, tupleIndexes, runCursors, tupleAccessors);
+ if (exists) {
+ int h = tpc.partition(tupleAccessors[runIndex], tupleIndexes[runIndex], tableSize);
+ topTuples.popAndReplace(tupleAccessors[runIndex], tupleIndexes[runIndex], h);
+ } else {
+ topTuples.pop();
+ closeRun(runIndex, runCursors, tupleAccessors);
+ }
+ }
+
+ private boolean hasNextTuple(int runIndex, int[] tupleIndexes, IFrameReader[] runCursors,
+ FrameTupleAccessor[] tupleAccessors) throws HyracksDataException {
+ if (tupleAccessors[runIndex] == null || runCursors[runIndex] == null) {
+ return false;
+ } else if (tupleIndexes[runIndex] >= tupleAccessors[runIndex].getTupleCount()) {
+ if (currentFrameIndexForRuns[runIndex] - runIndex * framesBuffered < bufferedFramesForRuns[runIndex] - 1) {
+ currentFrameIndexForRuns[runIndex]++;
+ } else {
+ bufferedFramesForRuns[runIndex] = 0;
+ for (int j = 0; j < framesBuffered; j++) {
+ if (runCursors[runIndex].nextFrame(inFrames.get(runIndex * framesBuffered + j))) {
+ bufferedFramesForRuns[runIndex]++;
+ } else {
+ break;
+ }
+ }
+ currentFrameIndexForRuns[runIndex] = runIndex * framesBuffered;
+ }
+ if (bufferedFramesForRuns[runIndex] > 0) {
+ tupleAccessors[runIndex].reset(inFrames.get(currentFrameIndexForRuns[runIndex]));
+ tupleIndexes[runIndex] = 0;
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ return true;
+ }
+ }
+
+ private void closeRun(int index, IFrameReader[] runCursors, IFrameTupleAccessor[] tupleAccessors)
+ throws HyracksDataException {
+ if (runCursors[index] != null) {
+ runCursors[index].close();
+ runCursors[index] = null;
+ tupleAccessors[index] = null;
+ }
+ }
+
+ private int compareFrameTuples(IFrameTupleAccessor fta1, int j1, IFrameTupleAccessor fta2, int j2) {
+ mergeCompCounter++;
+ byte[] b1 = fta1.getBuffer().array();
+ byte[] b2 = fta2.getBuffer().array();
+ for (int f = 0; f < keyFields.length; ++f) {
+ int fIdx = f;
+ int s1 = fta1.getTupleStartOffset(j1) + fta1.getFieldSlotsLength() + fta1.getFieldStartOffset(j1, fIdx);
+ int l1 = fta1.getFieldLength(j1, fIdx);
+ int s2 = fta2.getTupleStartOffset(j2) + fta2.getFieldSlotsLength() + fta2.getFieldStartOffset(j2, fIdx);
+ int l2 = fta2.getFieldLength(j2, fIdx);
+ int c = comparators[f].compare(b1, s1, l1, b2, s2, l2);
+ if (c != 0) {
+ return c;
+ }
+ }
+ return 0;
+ }
+
+ private Comparator<ReferenceEntryWithBucketID> createEntryComparator(final IBinaryComparator[] comparators) {
+ return new Comparator<ReferenceEntryWithBucketID>() {
+ public int compare(ReferenceEntryWithBucketID tp1, ReferenceEntryWithBucketID tp2) {
+
+ queueCompCounter++;
+
+ int cmp = tp1.getBucketID() - tp2.getBucketID();
+
+ if (cmp != 0) {
+ return cmp;
+ }
+
+ FrameTupleAccessor fta1 = (FrameTupleAccessor) tp1.getAccessor();
+ FrameTupleAccessor fta2 = (FrameTupleAccessor) tp2.getAccessor();
+ int j1 = tp1.getTupleIndex();
+ int j2 = tp2.getTupleIndex();
+ byte[] b1 = fta1.getBuffer().array();
+ byte[] b2 = fta2.getBuffer().array();
+ for (int f = 0; f < keyFields.length; ++f) {
+ int fIdx = keyFields[f];
+ int s1 = fta1.getTupleStartOffset(j1) + fta1.getFieldSlotsLength()
+ + fta1.getFieldStartOffset(j1, fIdx);
+ int l1 = fta1.getFieldEndOffset(j1, fIdx) - fta1.getFieldStartOffset(j1, fIdx);
+ int s2 = fta2.getTupleStartOffset(j2) + fta2.getFieldSlotsLength()
+ + fta2.getFieldStartOffset(j2, fIdx);
+ int l2 = fta2.getFieldEndOffset(j2, fIdx) - fta2.getFieldStartOffset(j2, fIdx);
+ int c = comparators[f].compare(b1, s1, l1, b2, s2, l2);
+ if (c != 0) {
+ return c;
+ }
+ }
+
+ return cmp;
+ }
+ };
+ }
+}
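Note: the reader above merges sorted, pre-grouped runs through a priority queue ordered by
(bucket id, group key) and folds consecutive equal groups into a single cached partial result
before appending it to the output frame. Below is a self-contained sketch of the same
merge-with-aggregation pattern over plain iterators; all names are illustrative, it sums
integer-keyed values instead of using Hyracks aggregators, and it orders by key only,
ignoring the bucket-id tie-break used above.

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.Iterator;
    import java.util.List;
    import java.util.PriorityQueue;

    public class MergeAggregateSketch {
        static class Entry {
            final int key;     // stands in for the group key fields
            final long value;  // stands in for the partial aggregate
            Entry(int key, long value) { this.key = key; this.value = value; }
        }

        static class Head {
            final Iterator<Entry> run;
            Entry entry;
            Head(Iterator<Entry> run) { this.run = run; this.entry = run.next(); }
        }

        // Merge runs that are each sorted by key, summing the values of equal
        // keys through a one-group cache, the analogue of groupResultCache.
        static List<Entry> mergeRuns(List<Iterator<Entry>> runs) {
            PriorityQueue<Head> heap = new PriorityQueue<>(Comparator.comparingInt((Head h) -> h.entry.key));
            for (Iterator<Entry> r : runs) {
                if (r.hasNext()) {
                    heap.add(new Head(r));
                }
            }
            List<Entry> out = new ArrayList<>();
            Integer cachedKey = null;
            long cachedSum = 0;
            while (!heap.isEmpty()) {
                Head top = heap.poll();
                Entry e = top.entry;
                if (cachedKey != null && cachedKey == e.key) {
                    cachedSum += e.value;                      // aggregate into the cache
                } else {
                    if (cachedKey != null) {
                        out.add(new Entry(cachedKey, cachedSum)); // flush the finished group
                    }
                    cachedKey = e.key;                         // start a new cached group
                    cachedSum = e.value;
                }
                if (top.run.hasNext()) {                       // advance the winning run
                    top.entry = top.run.next();
                    heap.add(top);
                }
            }
            if (cachedKey != null) {
                out.add(new Entry(cachedKey, cachedSum));
            }
            return out;
        }
    }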
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGroupHashTable.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGroupHashTable.java
new file mode 100644
index 0000000..6e85cff
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGroupHashTable.java
@@ -0,0 +1,686 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.LinkedList;
+
+import edu.uci.ics.hyracks.api.comm.FrameHelper;
+import edu.uci.ics.hyracks.api.comm.IFrameWriter;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.INormalizedKeyComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.io.FileReference;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileReader;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileWriter;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.group.hybridhash.FrameTupleAccessorForGroupHashtable;
+import edu.uci.ics.hyracks.dataflow.std.group.hybridhash.FrameTupleAppenderForGroupHashtable;
+import edu.uci.ics.hyracks.dataflow.std.structures.TuplePointer;
+
+public class HybridHashSortGroupHashTable {
+
+ protected static final int INT_SIZE = 4;
+ protected static final int INIT_REF_COUNT = 8;
+ protected static final int PTR_SIZE = 3;
+
+ protected final int tableSize, framesLimit, frameSize;
+
+ protected final ByteBuffer[] headers;
+ protected final ByteBuffer[] contents;
+
+ protected final IHyracksTaskContext ctx;
+
+ protected int currentLargestFrameIndex;
+ protected int totalTupleCount;
+
+ protected final IAggregatorDescriptor aggregator;
+ protected final AggregateState aggState;
+
+ protected final int[] keys, internalKeys;
+
+ private final IBinaryComparator[] comparators;
+
+ protected final ITuplePartitionComputer tpc;
+
+ protected final INormalizedKeyComputer firstNormalizer;
+
+ private ByteBuffer outputBuffer;
+
+ private LinkedList<RunFileReader> runReaders;
+
+ protected TuplePointer matchPointer;
+
+ protected final FrameTupleAccessorForGroupHashtable hashtableRecordAccessor;
+
+ private final FrameTupleAccessorForGroupHashtable compFrameAccessor1, compFrameAccessor2;
+
+ protected final FrameTupleAppenderForGroupHashtable internalAppender;
+
+ private final FrameTupleAppender outputAppender;
+
+ /**
+ * Tuple builder for hash table insertion
+ */
+ protected final ArrayTupleBuilder internalTupleBuilder, outputTupleBuilder;
+
+ /**
+ * pointers for sort records in an entry
+ */
+ protected int[] tPointers;
+
+ protected int usedEntries = 0;
+
+ protected long hashedKeys = 0, hashedRawRec = 0;
+
+ public HybridHashSortGroupHashTable(IHyracksTaskContext ctx, int frameLimits, int tableSize, int[] keys,
+ IBinaryComparator[] comparators, ITuplePartitionComputer tpc,
+ INormalizedKeyComputer firstNormalizerComputer, IAggregatorDescriptor aggregator,
+ RecordDescriptor inRecDesc, RecordDescriptor outRecDesc) {
+ this.ctx = ctx;
+ this.tableSize = tableSize;
+ this.framesLimit = frameLimits;
+ this.frameSize = ctx.getFrameSize();
+
+ this.keys = keys;
+ this.internalKeys = new int[keys.length];
+ for (int i = 0; i < internalKeys.length; i++) {
+ internalKeys[i] = i;
+ }
+
+ this.aggregator = aggregator;
+ this.aggState = aggregator.createAggregateStates();
+
+ this.tpc = tpc;
+ this.comparators = comparators;
+ this.firstNormalizer = firstNormalizerComputer;
+
+ // initialize the hash table
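+ // each entry stores two ints (frame index, tuple index) = 2 * INT_SIZE bytes,
+ // so the header area below spans ceil(tableSize * 2 * INT_SIZE / frameSize) frames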
+ int residual = ((tableSize % frameSize) * INT_SIZE * 2) % frameSize == 0 ? 0 : 1;
+ this.headers = new ByteBuffer[tableSize / frameSize * INT_SIZE * 2 + tableSize % frameSize * 2 * INT_SIZE
+ / frameSize + residual];
+
+ this.outputBuffer = ctx.allocateFrame();
+
+ this.contents = new ByteBuffer[framesLimit - 1 - headers.length];
+ this.currentLargestFrameIndex = -1;
+ this.totalTupleCount = 0;
+
+ this.runReaders = new LinkedList<RunFileReader>();
+ this.hashtableRecordAccessor = new FrameTupleAccessorForGroupHashtable(frameSize, outRecDesc);
+ this.compFrameAccessor1 = new FrameTupleAccessorForGroupHashtable(frameSize, outRecDesc);
+ this.compFrameAccessor2 = new FrameTupleAccessorForGroupHashtable(frameSize, outRecDesc);
+
+ this.internalTupleBuilder = new ArrayTupleBuilder(outRecDesc.getFieldCount());
+ this.outputTupleBuilder = new ArrayTupleBuilder(outRecDesc.getFieldCount());
+ this.internalAppender = new FrameTupleAppenderForGroupHashtable(frameSize);
+ this.outputAppender = new FrameTupleAppender(frameSize);
+
+ this.matchPointer = new TuplePointer();
+
+ }
+
+ /**
+ * Reset the header page
+ *
+ * @param headerFrameIndex
+ */
+ protected void resetHeader(int headerFrameIndex) {
+ for (int i = 0; i < frameSize; i += INT_SIZE) {
+ headers[headerFrameIndex].putInt(i, -1);
+ }
+ }
+
+ /**
+ * Get the header frame index of the given hash table entry
+ *
+ * @param entry
+ * @return
+ */
+ protected int getHeaderFrameIndex(int entry) {
+ int frameIndex = entry / frameSize * 2 * INT_SIZE + entry % frameSize * 2 * INT_SIZE / frameSize;
+ return frameIndex;
+ }
+
+ /**
+ * Get the tuple index of the given hash table entry
+ *
+ * @param entry
+ * @return
+ */
+ protected int getHeaderTupleIndex(int entry) {
+ int offset = entry % frameSize * 2 * INT_SIZE % frameSize;
+ return offset;
+ }
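+
+ // Worked example (assuming a 32768-byte frame and INT_SIZE = 4): each entry
+ // uses 2 ints = 8 bytes of header space, so one header frame holds
+ // 32768 / 8 = 4096 entry slots. For entry 5000, 5000 % 32768 * 2 * INT_SIZE
+ // = 40000, which lands in header frame 40000 / 32768 = 1 at byte offset
+ // 40000 % 32768 = 7232.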
+
+ public void insert(FrameTupleAccessor accessor, int tupleIndex) throws HyracksDataException {
+
+ int entry = tpc.partition(accessor, tupleIndex, tableSize);
+
+ hashedRawRec++;
+
+ if (findMatch(entry, accessor, tupleIndex)) {
+ // find match; do aggregation
+ hashtableRecordAccessor.reset(contents[matchPointer.frameIndex]);
+ aggregator.aggregate(accessor, tupleIndex, hashtableRecordAccessor, matchPointer.tupleIndex, aggState);
+ } else {
+
+ internalTupleBuilder.reset();
+ for (int k = 0; k < keys.length; k++) {
+ internalTupleBuilder.addField(accessor, tupleIndex, keys[k]);
+ }
+ aggregator.init(internalTupleBuilder, accessor, tupleIndex, aggState);
+ int insertFrameIndex = -1, insertTupleIndex = -1;
+ boolean inserted = false;
+
+ if (currentLargestFrameIndex < 0) {
+ currentLargestFrameIndex = 0;
+ }
+
+ if (contents[currentLargestFrameIndex] == null) {
+ contents[currentLargestFrameIndex] = ctx.allocateFrame();
+ }
+
+ internalAppender.reset(contents[currentLargestFrameIndex], false);
+ if (internalAppender.append(internalTupleBuilder.getFieldEndOffsets(), internalTupleBuilder.getByteArray(),
+ 0, internalTupleBuilder.getSize())) {
+ inserted = true;
+ insertFrameIndex = currentLargestFrameIndex;
+ insertTupleIndex = internalAppender.getTupleCount() - 1;
+ }
+
+ if (!inserted && currentLargestFrameIndex < contents.length - 1) {
+ currentLargestFrameIndex++;
+ if (contents[currentLargestFrameIndex] == null) {
+ contents[currentLargestFrameIndex] = ctx.allocateFrame();
+ }
+ internalAppender.reset(contents[currentLargestFrameIndex], true);
+ if (!internalAppender.append(internalTupleBuilder.getFieldEndOffsets(),
+ internalTupleBuilder.getByteArray(), 0, internalTupleBuilder.getSize())) {
+ throw new HyracksDataException("Failed to insert an aggregation value.");
+ } else {
+ insertFrameIndex = currentLargestFrameIndex;
+ insertTupleIndex = internalAppender.getTupleCount() - 1;
+ inserted = true;
+ }
+ }
+
+ // memory is full
+ if (!inserted) {
+ // flush hash table and try to insert again
+ flush();
+
+ // the table was flushed and reset, so invalidate the match pointer;
+ // the re-inserted record will be linked from its entry header below
+ matchPointer.frameIndex = -1;
+ matchPointer.tupleIndex = -1;
+ // re-insert
+ currentLargestFrameIndex++;
+ if (contents[currentLargestFrameIndex] == null) {
+ contents[currentLargestFrameIndex] = ctx.allocateFrame();
+ }
+ internalAppender.reset(contents[currentLargestFrameIndex], true);
+ if (!internalAppender.append(internalTupleBuilder.getFieldEndOffsets(),
+ internalTupleBuilder.getByteArray(), 0, internalTupleBuilder.getSize())) {
+ throw new HyracksDataException("Failed to insert an aggregation value.");
+ } else {
+ insertFrameIndex = currentLargestFrameIndex;
+ insertTupleIndex = internalAppender.getTupleCount() - 1;
+ }
+ }
+
+ // link the newly inserted record into its entry
+ if (matchPointer.frameIndex < 0) {
+ // first record for this entry; update the header references
+ int headerFrameIndex = getHeaderFrameIndex(entry);
+ int headerFrameOffset = getHeaderTupleIndex(entry);
+ if (headers[headerFrameIndex] == null) {
+ headers[headerFrameIndex] = ctx.allocateFrame();
+ resetHeader(headerFrameIndex);
+ }
+ headers[headerFrameIndex].putInt(headerFrameOffset, insertFrameIndex);
+ headers[headerFrameIndex].putInt(headerFrameOffset + INT_SIZE, insertTupleIndex);
+ usedEntries++;
+
+ } else {
+ // update the previous reference
+ hashtableRecordAccessor.reset(contents[matchPointer.frameIndex]);
+ int refOffset = hashtableRecordAccessor.getTupleHashReferenceOffset(matchPointer.tupleIndex);
+ contents[matchPointer.frameIndex].putInt(refOffset, insertFrameIndex);
+ contents[matchPointer.frameIndex].putInt(refOffset + INT_SIZE, insertTupleIndex);
+ }
+ hashedKeys++;
+ totalTupleCount++;
+ }
+ }
+
+ /**
+ * Flush the hash table directly to the output
+ */
+ public void flushHashtableToOutput(IFrameWriter outputWriter) throws HyracksDataException {
+
+ outputAppender.reset(outputBuffer, true);
+ for (int i = 0; i < contents.length; i++) {
+ if (contents[i] == null) {
+ continue;
+ }
+ hashtableRecordAccessor.reset(contents[i]);
+ int tupleCount = hashtableRecordAccessor.getTupleCount();
+ for (int j = 0; j < tupleCount; j++) {
+ outputTupleBuilder.reset();
+
+ int tupleOffset = hashtableRecordAccessor.getTupleStartOffset(j);
+ int fieldOffset = hashtableRecordAccessor.getFieldCount() * INT_SIZE;
+
+ for (int k = 0; k < internalKeys.length; k++) {
+ outputTupleBuilder.addField(hashtableRecordAccessor.getBuffer().array(), tupleOffset + fieldOffset
+ + hashtableRecordAccessor.getFieldStartOffset(j, k),
+ hashtableRecordAccessor.getFieldLength(j, k));
+ }
+
+ aggregator.outputFinalResult(outputTupleBuilder, hashtableRecordAccessor, j, aggState);
+
+ if (!outputAppender.append(outputTupleBuilder.getFieldEndOffsets(), outputTupleBuilder.getByteArray(),
+ 0, outputTupleBuilder.getSize())) {
+
+ FrameUtils.flushFrame(outputBuffer, outputWriter);
+
+ outputAppender.reset(outputBuffer, true);
+ if (!outputAppender.append(outputTupleBuilder.getFieldEndOffsets(),
+ outputTupleBuilder.getByteArray(), 0, outputTupleBuilder.getSize())) {
+ throw new HyracksDataException("Failed to flush the hash table to the final output");
+ }
+ }
+ }
+ }
+
+ if (outputAppender.getTupleCount() > 0) {
+
+ FrameUtils.flushFrame(outputBuffer, outputWriter);
+
+ outputAppender.reset(outputBuffer, true);
+ }
+
+ totalTupleCount = 0;
+ usedEntries = 0;
+ }
+
+ /**
+ * Flush hash table into a run file.
+ *
+ * @throws HyracksDataException
+ */
+ protected void flush() throws HyracksDataException {
+
+ long methodTimer = System.nanoTime();
+
+ FileReference runFile;
+ try {
+ runFile = ctx.getJobletContext().createManagedWorkspaceFile(
+ HybridHashSortGroupHashTable.class.getSimpleName());
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ RunFileWriter runWriter = new RunFileWriter(runFile, ctx.getIOManager());
+ runWriter.open();
+ flushEntries(runWriter);
+ runWriter.close();
+ runReaders.add(runWriter.createReader());
+ reset();
+
+ ctx.getCounterContext()
+ .getCounter("optional." + HybridHashSortGroupHashTable.class.getSimpleName() + ".flush.time", true)
+ .update(System.nanoTime() - methodTimer);
+ }
+
+ private void flushEntries(IFrameWriter writer) throws HyracksDataException {
+
+ outputAppender.reset(outputBuffer, true);
+ for (int i = 0; i < tableSize; i++) {
+ int tupleInEntry = sortEntry(i);
+
+ for (int ptr = 0; ptr < tupleInEntry; ptr++) {
+ int frameIndex = tPointers[ptr * PTR_SIZE];
+ int tupleIndex = tPointers[ptr * PTR_SIZE + 1];
+
+ hashtableRecordAccessor.reset(contents[frameIndex]);
+ outputTupleBuilder.reset();
+
+ int tupleOffset = hashtableRecordAccessor.getTupleStartOffset(tupleIndex);
+ int fieldOffset = hashtableRecordAccessor.getFieldCount() * INT_SIZE;
+
+ for (int k = 0; k < internalKeys.length; k++) {
+ outputTupleBuilder.addField(hashtableRecordAccessor.getBuffer().array(), tupleOffset + fieldOffset
+ + hashtableRecordAccessor.getFieldStartOffset(tupleIndex, k),
+ hashtableRecordAccessor.getFieldLength(tupleIndex, k));
+ }
+
+ aggregator.outputPartialResult(outputTupleBuilder, hashtableRecordAccessor, tupleIndex, aggState);
+
+ if (!outputAppender.append(outputTupleBuilder.getFieldEndOffsets(), outputTupleBuilder.getByteArray(),
+ 0, outputTupleBuilder.getSize())) {
+
+ FrameUtils.flushFrame(outputBuffer, writer);
+
+ outputAppender.reset(outputBuffer, true);
+ if (!outputAppender.append(outputTupleBuilder.getFieldEndOffsets(),
+ outputTupleBuilder.getByteArray(), 0, outputTupleBuilder.getSize())) {
+ throw new HyracksDataException("Failed to flush an aggregation result.");
+ }
+ }
+ totalTupleCount--;
+ }
+
+ if (tupleInEntry > 0) {
+ usedEntries--;
+ }
+ }
+
+ if (outputAppender.getTupleCount() > 0) {
+
+ FrameUtils.flushFrame(outputBuffer, writer);
+
+ outputAppender.reset(outputBuffer, true);
+ }
+ }
+
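+ /**
+ * Collect all records of the given entry into the pointer array and sort
+ * them. Each record is described by PTR_SIZE consecutive ints in tPointers:
+ * a frame index, a tuple index, and the normalized key of the first
+ * grouping field (this assumes PTR_SIZE is 3, as defined earlier in this
+ * class).
+ *
+ * @param entryID
+ * @return the number of records collected for this entry
+ */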
+ protected int sortEntry(int entryID) {
+
+ if (tPointers == null)
+ tPointers = new int[INIT_REF_COUNT * PTR_SIZE];
+ int ptr = 0;
+
+ int headerFrameIndex = entryID / frameSize * 2 * INT_SIZE + (entryID % frameSize) * 2 * INT_SIZE / frameSize;
+ int headerFrameOffset = (entryID % frameSize) * 2 * INT_SIZE % frameSize;
+
+ if (headers[headerFrameIndex] == null) {
+ return 0;
+ }
+
+ int entryFrameIndex = headers[headerFrameIndex].getInt(headerFrameOffset);
+ int entryTupleIndex = headers[headerFrameIndex].getInt(headerFrameOffset + INT_SIZE);
+
+ do {
+ if (entryFrameIndex < 0) {
+ break;
+ }
+ hashtableRecordAccessor.reset(contents[entryFrameIndex]);
+ tPointers[ptr * PTR_SIZE] = entryFrameIndex;
+ tPointers[ptr * PTR_SIZE + 1] = entryTupleIndex;
+ int tStart = hashtableRecordAccessor.getTupleStartOffset(entryTupleIndex);
+ int f0StartRel = hashtableRecordAccessor.getFieldStartOffset(entryTupleIndex, internalKeys[0]);
+ int f0EndRel = hashtableRecordAccessor.getFieldEndOffset(entryTupleIndex, internalKeys[0]);
+ int f0Start = f0StartRel + tStart + hashtableRecordAccessor.getFieldSlotsLength();
+ tPointers[ptr * PTR_SIZE + 2] = firstNormalizer == null ? 0 : firstNormalizer.normalize(
+ hashtableRecordAccessor.getBuffer().array(), f0Start, f0EndRel - f0StartRel);
+
+ ptr++;
+
+ if (ptr * PTR_SIZE >= tPointers.length) {
+ int[] newTPointers = new int[tPointers.length * 2];
+ System.arraycopy(tPointers, 0, newTPointers, 0, tPointers.length);
+ tPointers = newTPointers;
+ }
+
+ // move to the next record
+ int refOffset = hashtableRecordAccessor.getTupleHashReferenceOffset(entryTupleIndex);
+ int prevFrameIndex = entryFrameIndex;
+ entryFrameIndex = contents[prevFrameIndex].getInt(refOffset);
+ entryTupleIndex = contents[prevFrameIndex].getInt(refOffset + INT_SIZE);
+
+ } while (true);
+
+ // sort records
+ if (ptr > 1) {
+ sort(0, ptr);
+ }
+
+ return ptr;
+ }
+
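+ /*
+ * A three-way (Bentley-McIlroy style) quicksort over the pointer array:
+ * records equal to the pivot are swapped to both ends of the range and
+ * moved back to the middle afterwards, so duplicate keys do not degrade the
+ * partitioning. Normalized keys are compared as unsigned ints before
+ * falling back to a full key comparison.
+ */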
+ protected void sort(int offset, int len) {
+ int m = offset + (len >> 1);
+ int mFrameIndex = tPointers[m * PTR_SIZE];
+ int mTupleIndex = tPointers[m * PTR_SIZE + 1];
+ int mNormKey = tPointers[m * PTR_SIZE + 2];
+ compFrameAccessor1.reset(contents[mFrameIndex]);
+
+ int a = offset;
+ int b = a;
+ int c = offset + len - 1;
+ int d = c;
+ while (true) {
+ while (b <= c) {
+ int bFrameIndex = tPointers[b * PTR_SIZE];
+ int bTupleIndex = tPointers[b * PTR_SIZE + 1];
+ int bNormKey = tPointers[b * PTR_SIZE + 2];
+ int cmp = 0;
+ if (bNormKey != mNormKey) {
+ cmp = ((((long) bNormKey) & 0xffffffffL) < (((long) mNormKey) & 0xffffffffL)) ? -1 : 1;
+ } else {
+ compFrameAccessor2.reset(contents[bFrameIndex]);
+ cmp = compare(compFrameAccessor2, bTupleIndex, compFrameAccessor1, mTupleIndex);
+ }
+ if (cmp > 0) {
+ break;
+ }
+ if (cmp == 0) {
+ swap(a++, b);
+ }
+ ++b;
+ }
+ while (c >= b) {
+ int cFrameIndex = tPointers[c * PTR_SIZE];
+ int cTupleIndex = tPointers[c * PTR_SIZE + 1];
+ int cNormKey = tPointers[c * PTR_SIZE + 2];
+ int cmp = 0;
+ if (cNormKey != mNormKey) {
+ cmp = ((((long) cNormKey) & 0xffffffffL) < (((long) mNormKey) & 0xffffffffL)) ? -1 : 1;
+ } else {
+ compFrameAccessor2.reset(contents[cFrameIndex]);
+ cmp = compare(compFrameAccessor2, cTupleIndex, compFrameAccessor1, mTupleIndex);
+ }
+ if (cmp < 0) {
+ break;
+ }
+ if (cmp == 0) {
+ swap(c, d--);
+ }
+ --c;
+ }
+ if (b > c)
+ break;
+ swap(b++, c--);
+ }
+
+ int s;
+ int n = offset + len;
+ s = Math.min(a - offset, b - a);
+ vecswap(offset, b - s, s);
+ s = Math.min(d - c, n - d - 1);
+ vecswap(b, n - s, s);
+
+ if ((s = b - a) > 1) {
+ sort(offset, s);
+ }
+ if ((s = d - c) > 1) {
+ sort(n - s, s);
+ }
+ }
+
+ private void swap(int a, int b) {
+ for (int i = 0; i < PTR_SIZE; i++) {
+ int t = tPointers[a * PTR_SIZE + i];
+ tPointers[a * PTR_SIZE + i] = tPointers[b * PTR_SIZE + i];
+ tPointers[b * PTR_SIZE + i] = t;
+ }
+ }
+
+ private void vecswap(int a, int b, int n) {
+ for (int i = 0; i < n; i++, a++, b++) {
+ swap(a, b);
+ }
+ }
+
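+ /**
+ * Look up the given record's key in the given hash table entry. As a side
+ * effect, matchPointer is left pointing at the matching record, or at the
+ * last record of the entry's chain (or (-1, -1) for an empty entry), so
+ * that insert() can link a new record after it.
+ *
+ * @param entry
+ * @param accessor
+ * @param tupleIndex
+ * @return true if a record with the same key is found
+ */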
+ protected boolean findMatch(int entry, FrameTupleAccessor accessor, int tupleIndex) {
+
+ // reset the match pointer
+ matchPointer.frameIndex = -1;
+ matchPointer.tupleIndex = -1;
+
+ // get reference in the header
+ int headerFrameIndex = getHeaderFrameIndex(entry);
+ int headerFrameOffset = getHeaderTupleIndex(entry);
+
+ if (headers[headerFrameIndex] == null) {
+ return false;
+ }
+
+ // initialize the pointer to the first record
+ int entryFrameIndex = headers[headerFrameIndex].getInt(headerFrameOffset);
+ int entryTupleIndex = headers[headerFrameIndex].getInt(headerFrameOffset + INT_SIZE);
+
+ while (entryFrameIndex >= 0) {
+ matchPointer.frameIndex = entryFrameIndex;
+ matchPointer.tupleIndex = entryTupleIndex;
+ hashtableRecordAccessor.reset(contents[entryFrameIndex]);
+
+ if (compare(accessor, tupleIndex, hashtableRecordAccessor, entryTupleIndex) == 0) {
+ return true;
+ }
+ // Move to the next record in this entry following the linked list
+ int refOffset = hashtableRecordAccessor.getTupleHashReferenceOffset(entryTupleIndex);
+ int prevFrameIndex = entryFrameIndex;
+ entryFrameIndex = contents[prevFrameIndex].getInt(refOffset);
+ entryTupleIndex = contents[prevFrameIndex].getInt(refOffset + INT_SIZE);
+ }
+
+ return false;
+ }
+
+ public LinkedList<RunFileReader> getRunFileReaders() {
+ return runReaders;
+ }
+
+ private int compare(FrameTupleAccessor accessor, int tupleIndex, FrameTupleAccessorForGroupHashtable hashAccessor,
+ int hashTupleIndex) {
+ int tStart0 = accessor.getTupleStartOffset(tupleIndex);
+ int fStartOffset0 = accessor.getFieldSlotsLength() + tStart0;
+
+ int tStart1 = hashAccessor.getTupleStartOffset(hashTupleIndex);
+ int fStartOffset1 = hashAccessor.getFieldSlotsLength() + tStart1;
+
+ for (int i = 0; i < keys.length; ++i) {
+ int fStart0 = accessor.getFieldStartOffset(tupleIndex, keys[i]);
+ int fEnd0 = accessor.getFieldEndOffset(tupleIndex, keys[i]);
+ int fLen0 = fEnd0 - fStart0;
+
+ int fStart1 = hashAccessor.getFieldStartOffset(hashTupleIndex, internalKeys[i]);
+ int fEnd1 = hashAccessor.getFieldEndOffset(hashTupleIndex, internalKeys[i]);
+ int fLen1 = fEnd1 - fStart1;
+
+ int c = comparators[i].compare(accessor.getBuffer().array(), fStart0 + fStartOffset0, fLen0, hashAccessor
+ .getBuffer().array(), fStart1 + fStartOffset1, fLen1);
+ if (c != 0) {
+ return c;
+ }
+ }
+ return 0;
+ }
+
+ private int compare(FrameTupleAccessorForGroupHashtable accessor1, int tupleIndex1,
+ FrameTupleAccessorForGroupHashtable accessor2, int tupleIndex2) {
+ int tStart1 = accessor1.getTupleStartOffset(tupleIndex1);
+ int fStartOffset1 = accessor1.getFieldSlotsLength() + tStart1;
+
+ int tStart2 = accessor2.getTupleStartOffset(tupleIndex2);
+ int fStartOffset2 = accessor2.getFieldSlotsLength() + tStart2;
+
+ for (int i = 0; i < internalKeys.length; ++i) {
+ int fStart1 = accessor1.getFieldStartOffset(tupleIndex1, internalKeys[i]);
+ int fEnd1 = accessor1.getFieldEndOffset(tupleIndex1, internalKeys[i]);
+ int fLen1 = fEnd1 - fStart1;
+
+ int fStart2 = accessor2.getFieldStartOffset(tupleIndex2, internalKeys[i]);
+ int fEnd2 = accessor2.getFieldEndOffset(tupleIndex2, internalKeys[i]);
+ int fLen2 = fEnd2 - fStart2;
+
+ int c = comparators[i].compare(accessor1.getBuffer().array(), fStart1 + fStartOffset1, fLen1, accessor2
+ .getBuffer().array(), fStart2 + fStartOffset2, fLen2);
+ if (c != 0) {
+ return c;
+ }
+ }
+ return 0;
+ }
+
+ public void reset() {
+ for (int i = 0; i < headers.length; i++) {
+ if (headers[i] != null) {
+ resetHeader(i);
+ }
+ }
+ for (int i = 0; i < contents.length; i++) {
+ if (contents[i] != null) {
+ contents[i].putInt(FrameHelper.getTupleCountOffset(frameSize), 0);
+ }
+ }
+
+ usedEntries = 0;
+ totalTupleCount = 0;
+ currentLargestFrameIndex = -1;
+ }
+
+ public void finishup() throws HyracksDataException {
+ if (runReaders.size() > 0) {
+ flush();
+ }
+
+ hashedKeys = 0;
+ hashedRawRec = 0;
+ }
+
+ /**
+ * Close the hash table. Note that only the memory held by frames is freed;
+ * aggregation states maintained in {@link #aggState} and run file readers
+ * in {@link #runReaders} remain valid for later processing.
+ */
+ public void close() throws HyracksDataException {
+ for (int i = 0; i < headers.length; i++) {
+ headers[i] = null;
+ }
+ for (int i = 0; i < contents.length; i++) {
+ contents[i] = null;
+ }
+ outputBuffer = null;
+ tPointers = null;
+ }
+
+ public int getTupleCount() {
+ return totalTupleCount;
+ }
+
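+ /*
+ * Note: despite its name, this returns the number of frames held by the
+ * hash table (header frames, content frames, and the output buffer), not
+ * the frame size in bytes.
+ */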
+ public int getFrameSize() {
+ return headers.length + contents.length + 1;
+ }
+}
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGroupOperatorDescriptor.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGroupOperatorDescriptor.java
new file mode 100644
index 0000000..5296c9f
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGroupOperatorDescriptor.java
@@ -0,0 +1,275 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.LinkedList;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.ActivityId;
+import edu.uci.ics.hyracks.api.dataflow.IActivityGraphBuilder;
+import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
+import edu.uci.ics.hyracks.api.dataflow.TaskId;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.INormalizedKeyComputerFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputerFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.job.JobId;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileReader;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractActivityNode;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractStateObject;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputSinkOperatorNodePushable;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryOutputSourceOperatorNodePushable;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+
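+/**
+ * A two-phase hash-sort-based group-by operator. The aggregate activity builds
+ * an in-memory hash table over the grouping keys, spilling sorted runs to disk
+ * when memory is exhausted; the merge activity then merges the runs, if any,
+ * into the final aggregation result.
+ * <p>
+ * A minimal wiring sketch (variable names are illustrative only):
+ *
+ * <pre>
+ * HybridHashSortGroupOperatorDescriptor grouper = new HybridHashSortGroupOperatorDescriptor(spec,
+ *         new int[] { 0 }, framesLimit, tableSize, comparatorFactories, aggTpcf, mergeTpcf,
+ *         aggregatorFactory, mergerFactory, outRecDesc);
+ * </pre>
+ */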
+public class HybridHashSortGroupOperatorDescriptor extends AbstractOperatorDescriptor {
+
+ private static final int AGGREGATE_ACTIVITY_ID = 0;
+
+ private static final int MERGE_ACTIVITY_ID = 1;
+
+ private static final long serialVersionUID = 1L;
+ private final int[] keyFields, storedKeyFields;
+ private final INormalizedKeyComputerFactory firstNormalizerFactory;
+
+ private final IAggregatorDescriptorFactory aggregatorFactory;
+ private final IAggregatorDescriptorFactory mergerFactory;
+
+ private final ITuplePartitionComputerFactory aggTpcf, mergeTpcf;
+
+ private final int framesLimit;
+ private final IBinaryComparatorFactory[] comparatorFactories;
+
+ private final int tableSize;
+
+ private final boolean isLoadOptimized;
+
+ public HybridHashSortGroupOperatorDescriptor(JobSpecification spec, int[] keyFields, int framesLimit,
+ int tableSize, IBinaryComparatorFactory[] comparatorFactories, ITuplePartitionComputerFactory aggTpcf,
+ ITuplePartitionComputerFactory mergeTpcf, IAggregatorDescriptorFactory aggregatorFactory,
+ IAggregatorDescriptorFactory mergerFactory, RecordDescriptor recordDescriptor) {
+ this(spec, keyFields, framesLimit, tableSize, comparatorFactories, aggTpcf, mergeTpcf, null, aggregatorFactory,
+ mergerFactory, recordDescriptor, false);
+ }
+
+ public HybridHashSortGroupOperatorDescriptor(JobSpecification spec, int[] keyFields, int framesLimit,
+ int tableSize, IBinaryComparatorFactory[] comparatorFactories, ITuplePartitionComputerFactory aggTpcf,
+ ITuplePartitionComputerFactory mergeTpcf, INormalizedKeyComputerFactory firstNormalizerFactory,
+ IAggregatorDescriptorFactory aggregatorFactory, IAggregatorDescriptorFactory mergerFactory,
+ RecordDescriptor recordDescriptor) {
+ this(spec, keyFields, framesLimit, tableSize, comparatorFactories, aggTpcf, mergeTpcf, firstNormalizerFactory,
+ aggregatorFactory, mergerFactory, recordDescriptor, false);
+ }
+
+ public HybridHashSortGroupOperatorDescriptor(JobSpecification spec, int[] keyFields, int framesLimit,
+ int tableSize, IBinaryComparatorFactory[] comparatorFactories, ITuplePartitionComputerFactory aggTpcf,
+ ITuplePartitionComputerFactory mergeTpcf, INormalizedKeyComputerFactory firstNormalizerFactory,
+ IAggregatorDescriptorFactory aggregatorFactory, IAggregatorDescriptorFactory mergerFactory,
+ RecordDescriptor recordDescriptor, boolean isLoadOpt) {
+ super(spec, 1, 1);
+ this.framesLimit = framesLimit;
+ if (framesLimit <= 2) {
+ /**
+ * Minimum of 3 frames: 2 for in-memory hash table, and 1 for output
+ * aggregation results.
+ */
+ throw new IllegalStateException("frame limit should at least be 3, but it is " + framesLimit + "!");
+ }
+
+ storedKeyFields = new int[keyFields.length];
+ for (int i = 0; i < storedKeyFields.length; i++) {
+ storedKeyFields[i] = i;
+ }
+ this.aggregatorFactory = aggregatorFactory;
+ this.mergerFactory = mergerFactory;
+ this.keyFields = keyFields;
+ this.comparatorFactories = comparatorFactories;
+ this.firstNormalizerFactory = firstNormalizerFactory;
+ this.aggTpcf = aggTpcf;
+ this.mergeTpcf = mergeTpcf;
+ this.tableSize = tableSize;
+
+ /**
+ * Set the record descriptor. Note that since this operator is a unary
+ * operator, only the first record descriptor is used here.
+ */
+ recordDescriptors[0] = recordDescriptor;
+
+ this.isLoadOptimized = isLoadOpt;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.dataflow.IOperatorDescriptor#contributeActivities(edu.uci.ics.hyracks.api.dataflow.
+ * IActivityGraphBuilder)
+ */
+ @Override
+ public void contributeActivities(IActivityGraphBuilder builder) {
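+ // The operator is split into two activities connected by a blocking edge:
+ // the aggregate activity consumes the input and builds the (possibly
+ // spilled) hash table, and the merge activity starts only after aggregation
+ // has finished, merging any run files into the final output.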
+ AggregateActivity aggregateAct = new AggregateActivity(new ActivityId(getOperatorId(), AGGREGATE_ACTIVITY_ID));
+ MergeActivity mergeAct = new MergeActivity(new ActivityId(getOperatorId(), MERGE_ACTIVITY_ID));
+
+ builder.addActivity(this, aggregateAct);
+ builder.addSourceEdge(0, aggregateAct, 0);
+
+ builder.addActivity(this, mergeAct);
+ builder.addTargetEdge(0, mergeAct, 0);
+
+ builder.addBlockingEdge(aggregateAct, mergeAct);
+ }
+
+ public static class AggregateActivityState extends AbstractStateObject {
+
+ private HybridHashSortGroupHashTable gTable;
+
+ public AggregateActivityState() {
+ }
+
+ private AggregateActivityState(JobId jobId, TaskId tId) {
+ super(jobId, tId);
+ }
+
+ @Override
+ public void toBytes(DataOutput out) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void fromBytes(DataInput in) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ private class AggregateActivity extends AbstractActivityNode {
+
+ private static final long serialVersionUID = 1L;
+
+ public AggregateActivity(ActivityId id) {
+ super(id);
+ }
+
+ @Override
+ public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
+ final IRecordDescriptorProvider recordDescProvider, final int partition, int nPartitions)
+ throws HyracksDataException {
+ return new AbstractUnaryInputSinkOperatorNodePushable() {
+
+ HybridHashSortGroupHashTable serializableGroupHashtable;
+
+ FrameTupleAccessor accessor;
+
+ @Override
+ public void open() throws HyracksDataException {
+
+ RecordDescriptor inRecDesc = recordDescProvider.getInputRecordDescriptor(getActivityId(), 0);
+
+ IBinaryComparator[] comparators = new IBinaryComparator[comparatorFactories.length];
+ for (int i = 0; i < comparatorFactories.length; i++) {
+ comparators[i] = comparatorFactories[i].createBinaryComparator();
+ }
+
+ // guard against the constructor variant that passes no normalizer factory
+ serializableGroupHashtable = new HybridHashSortGroupHashTable(ctx, framesLimit, tableSize,
+ keyFields, comparators, aggTpcf.createPartitioner(),
+ firstNormalizerFactory == null ? null : firstNormalizerFactory.createNormalizedKeyComputer(),
+ aggregatorFactory.createAggregator(ctx, inRecDesc, recordDescriptors[0], keyFields,
+ storedKeyFields), inRecDesc, recordDescriptors[0]);
+ accessor = new FrameTupleAccessor(ctx.getFrameSize(), inRecDesc);
+ }
+
+ @Override
+ public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
+ accessor.reset(buffer);
+ int tupleCount = accessor.getTupleCount();
+ for (int i = 0; i < tupleCount; i++) {
+ serializableGroupHashtable.insert(accessor, i);
+ }
+ }
+
+ @Override
+ public void fail() throws HyracksDataException {
+ }
+
+ @Override
+ public void close() throws HyracksDataException {
+ serializableGroupHashtable.finishup();
+ AggregateActivityState state = new AggregateActivityState(ctx.getJobletContext().getJobId(),
+ new TaskId(getActivityId(), partition));
+ state.gTable = serializableGroupHashtable;
+ ctx.setStateObject(state);
+ }
+ };
+ }
+ }
+
+ private class MergeActivity extends AbstractActivityNode {
+
+ private static final long serialVersionUID = 1L;
+
+ public MergeActivity(ActivityId id) {
+ super(id);
+ }
+
+ @Override
+ public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
+ IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
+ throws HyracksDataException {
+
+ return new AbstractUnaryOutputSourceOperatorNodePushable() {
+
+ public void initialize() throws HyracksDataException {
+
+ AggregateActivityState aggState = (AggregateActivityState) ctx.getStateObject(new TaskId(
+ new ActivityId(getOperatorId(), AGGREGATE_ACTIVITY_ID), partition));
+
+ LinkedList<RunFileReader> runs = aggState.gTable.getRunFileReaders();
+
+ writer.open();
+ if (runs.size() <= 0) {
+ aggState.gTable.flushHashtableToOutput(writer);
+ aggState.gTable.close();
+ } else {
+ aggState.gTable.close();
+
+ IBinaryComparator[] comparators = new IBinaryComparator[comparatorFactories.length];
+ for (int i = 0; i < comparatorFactories.length; i++) {
+ comparators[i] = comparatorFactories[i].createBinaryComparator();
+ }
+
+ HybridHashSortRunMerger merger = new HybridHashSortRunMerger(ctx, runs, storedKeyFields,
+ comparators, recordDescriptors[0], mergeTpcf.createPartitioner(),
+ mergerFactory.createAggregator(ctx, recordDescriptors[0], recordDescriptors[0],
+ storedKeyFields, storedKeyFields), framesLimit, tableSize, writer,
+ isLoadOptimized);
+
+ merger.process();
+ }
+
+ writer.close();
+ }
+
+ };
+ }
+ }
+
+}
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGrouperBucketMerge.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGrouperBucketMerge.java
new file mode 100644
index 0000000..1de2237
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortGrouperBucketMerge.java
@@ -0,0 +1,488 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.comm.IFrameWriter;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.io.FileReference;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileReader;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileWriter;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.util.ReferenceEntry;
+
+public class HybridHashSortGrouperBucketMerge {
+
+ private final int[] keyFields;
+ private final IBinaryComparator[] comparators;
+
+ private final IAggregatorDescriptor merger;
+ private final AggregateState mergeState;
+
+ private final int framesLimit, tableSize;
+
+ private final RecordDescriptor inRecDesc;
+
+ private final IHyracksTaskContext ctx;
+
+ private final ArrayTupleBuilder tupleBuilder;
+
+ private final IFrameWriter outputWriter;
+
+ private final ITuplePartitionComputer tpc;
+
+ private final boolean isLoadOptimized;
+
+ List<ByteBuffer> inFrames;
+ ByteBuffer outFrame, writerFrame;
+ FrameTupleAppender outAppender, writerAppender;
+ LinkedList<RunFileReader> runs;
+ ArrayTupleBuilder finalTupleBuilder;
+ FrameTupleAccessor outFrameAccessor;
+ int[] currentFrameIndexInRun, currentRunFrames, currentBucketInRun;
+ int runFrameLimit = 1;
+
+ public HybridHashSortGrouperBucketMerge(IHyracksTaskContext ctx, int[] keyFields, int framesLimit, int tableSize,
+ ITuplePartitionComputer tpc, IBinaryComparator[] comparators, IAggregatorDescriptor merger,
+ RecordDescriptor inRecDesc, RecordDescriptor outRecDesc, IFrameWriter outputWriter)
+ throws HyracksDataException {
+ this(ctx, keyFields, framesLimit, tableSize, tpc, comparators, merger, inRecDesc, outRecDesc, outputWriter,
+ true);
+ }
+
+ public HybridHashSortGrouperBucketMerge(IHyracksTaskContext ctx, int[] keyFields, int framesLimit, int tableSize,
+ ITuplePartitionComputer tpc, IBinaryComparator[] comparators, IAggregatorDescriptor merger,
+ RecordDescriptor inRecDesc, RecordDescriptor outRecDesc, IFrameWriter outputWriter, boolean loadOptimized)
+ throws HyracksDataException {
+ this.ctx = ctx;
+ this.framesLimit = framesLimit;
+ this.tableSize = tableSize;
+
+ this.keyFields = keyFields;
+ this.comparators = comparators;
+ this.merger = merger;
+ this.mergeState = merger.createAggregateStates();
+
+ this.inRecDesc = inRecDesc;
+
+ this.tupleBuilder = new ArrayTupleBuilder(inRecDesc.getFieldCount());
+
+ this.outAppender = new FrameTupleAppender(ctx.getFrameSize());
+
+ this.outputWriter = outputWriter;
+
+ this.outFrameAccessor = new FrameTupleAccessor(ctx.getFrameSize(), inRecDesc);
+
+ this.tpc = tpc;
+
+ this.isLoadOptimized = loadOptimized;
+ }
+
+ public void initialize(LinkedList<RunFileReader> runFiles) throws HyracksDataException {
+
+ runs = runFiles;
+
+ try {
+ if (runs.size() <= 0) {
+ return;
+ } else {
+ inFrames = new ArrayList<ByteBuffer>();
+ outFrame = ctx.allocateFrame();
+ outAppender.reset(outFrame, true);
+ outFrameAccessor.reset(outFrame);
+ int runProcOffset = 0;
+ while (runs.size() > 0) {
+ try {
+ doPass(runs, runProcOffset);
+ if (runs.size() + 2 <= framesLimit) {
+ // final phase
+ runProcOffset = 0;
+ } else {
+ // one more merge level
+ runProcOffset++;
+ }
+ } catch (Exception e) {
+ throw new HyracksDataException(e);
+ }
+ }
+ inFrames.clear();
+ }
+ } catch (Exception e) {
+ outputWriter.fail();
+ throw new HyracksDataException(e);
+ } finally {
+ mergeState.close();
+ }
+ }
+
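+ /**
+ * Perform one merge pass over the runs starting at the given offset. If all
+ * remaining runs fit into the available in-memory frames, this is the final
+ * pass and the merged result goes directly to the output writer; otherwise
+ * the merged result is written into a new run file that replaces the merged
+ * runs in the run list.
+ *
+ * @param runs
+ * @param offset
+ * @throws HyracksDataException
+ */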
+ private void doPass(LinkedList<RunFileReader> runs, int offset) throws HyracksDataException {
+ FileReference newRun = null;
+ IFrameWriter writer = outputWriter;
+ boolean finalPass = false;
+
+ int runNumber = runs.size() - offset;
+
+ while (inFrames.size() + 2 < framesLimit) {
+ inFrames.add(ctx.allocateFrame());
+ }
+
+ if (runNumber + 2 <= framesLimit) {
+ finalPass = true;
+ if (isLoadOptimized)
+ runFrameLimit = (framesLimit - 2) / runNumber;
+ else
+ runFrameLimit = 1;
+ } else {
+ runFrameLimit = 1;
+ runNumber = framesLimit - 2;
+ newRun = ctx.getJobletContext().createManagedWorkspaceFile(
+ HybridHashSortGrouperBucketMerge.class.getSimpleName());
+ writer = new RunFileWriter(newRun, ctx.getIOManager());
+ writer.open();
+ }
+ try {
+ currentFrameIndexInRun = new int[runNumber];
+ currentRunFrames = new int[runNumber];
+ currentBucketInRun = new int[runNumber];
+ /**
+ * Create file readers for each input run file, only for
+ * the ones fit into the inFrames
+ */
+ RunFileReader[] runFileReaders = new RunFileReader[runNumber];
+ FrameTupleAccessor[] tupleAccessors = new FrameTupleAccessor[inFrames.size()];
+ Comparator<ReferenceHashEntry> comparator = createEntryComparator(comparators);
+ ReferencedBucketBasedPriorityQueue topTuples = new ReferencedBucketBasedPriorityQueue(ctx.getFrameSize(),
+ inRecDesc, runNumber, comparator, tpc, tableSize);
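+ /**
+ * Runs are merged in (bucket, key) order: the priority queue ranks the
+ * head tuples of the runs first by their hash bucket and then by the
+ * grouping key, so records of the same group surface consecutively and
+ * can be aggregated on the fly.
+ */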
+ /**
+ * current tuple index in each run
+ */
+ int[] tupleIndices = new int[runNumber];
+
+ for (int i = 0; i < runNumber; i++) {
+ int runIndex = i + offset;
+ tupleIndices[i] = 0;
+ // Load the run file
+ runFileReaders[i] = runs.get(runIndex);
+ runFileReaders[i].open();
+
+ currentRunFrames[i] = 0;
+ currentFrameIndexInRun[i] = i * runFrameLimit;
+ for (int j = 0; j < runFrameLimit; j++) {
+ int frameIndex = currentFrameIndexInRun[i] + j;
+ // the reader array is indexed by i, not by the offset run index
+ boolean hasNextFrame = runFileReaders[i].nextFrame(inFrames.get(frameIndex));
+ if (hasNextFrame) {
+ tupleAccessors[frameIndex] = new FrameTupleAccessor(ctx.getFrameSize(), inRecDesc);
+ tupleAccessors[frameIndex].reset(inFrames.get(frameIndex));
+ currentRunFrames[i]++;
+ if (j == 0) {
+ currentBucketInRun[i] = tpc.partition(tupleAccessors[frameIndex], tupleIndices[i],
+ tableSize);
+ setNextTopTuple(i, tupleIndices, runFileReaders, tupleAccessors, topTuples);
+ }
+ } else {
+ break;
+ }
+ }
+ }
+
+ /**
+ * Start merging
+ */
+ while (!topTuples.areRunsExhausted()) {
+ /**
+ * Get the top record
+ */
+ ReferenceEntry top = topTuples.peek();
+ int tupleIndex = top.getTupleIndex();
+ int runIndex = top.getRunid();
+
+ FrameTupleAccessor fta = top.getAccessor();
+
+ int currentTupleInOutFrame = outFrameAccessor.getTupleCount() - 1;
+ if (currentTupleInOutFrame < 0
+ || compareFrameTuples(fta, tupleIndex, outFrameAccessor, currentTupleInOutFrame) != 0) {
+
+ tupleBuilder.reset();
+
+ for (int k = 0; k < keyFields.length; k++) {
+ tupleBuilder.addField(fta, tupleIndex, keyFields[k]);
+ }
+
+ merger.init(tupleBuilder, fta, tupleIndex, mergeState);
+
+ if (!outAppender.appendSkipEmptyField(tupleBuilder.getFieldEndOffsets(),
+ tupleBuilder.getByteArray(), 0, tupleBuilder.getSize())) {
+ flushOutFrame(writer, finalPass);
+ if (!outAppender.appendSkipEmptyField(tupleBuilder.getFieldEndOffsets(),
+ tupleBuilder.getByteArray(), 0, tupleBuilder.getSize())) {
+ throw new HyracksDataException(
+ "The partial result is too large to fit in a frame.");
+ }
+ }
+
+ } else {
+ /**
+ * if new tuple is in the same group of the
+ * current aggregator do merge and output to the
+ * outFrame
+ */
+
+ merger.aggregate(fta, tupleIndex, outFrameAccessor, currentTupleInOutFrame, mergeState);
+
+ }
+ tupleIndices[runIndex]++;
+ setNextTopTuple(runIndex, tupleIndices, runFileReaders, tupleAccessors, topTuples);
+ }
+
+ if (outAppender.getTupleCount() > 0) {
+ flushOutFrame(writer, finalPass);
+ outAppender.reset(outFrame, true);
+ }
+
+ merger.close();
+
+ runs.subList(offset, offset + runNumber).clear();
+ /**
+ * insert the new run file into the beginning of the run
+ * file list
+ */
+ if (!finalPass) {
+ runs.add(offset, ((RunFileWriter) writer).createReader());
+ }
+ } finally {
+ if (!finalPass) {
+ writer.close();
+ }
+ mergeState.reset();
+ }
+ }
+
+ private void flushOutFrame(IFrameWriter writer, boolean isFinal) throws HyracksDataException {
+
+ if (finalTupleBuilder == null) {
+ finalTupleBuilder = new ArrayTupleBuilder(inRecDesc.getFieldCount());
+ }
+
+ if (writerFrame == null) {
+ writerFrame = ctx.allocateFrame();
+ }
+
+ if (writerAppender == null) {
+ writerAppender = new FrameTupleAppender(ctx.getFrameSize());
+ }
+ writerAppender.reset(writerFrame, true);
+
+ outFrameAccessor.reset(outFrame);
+
+ for (int i = 0; i < outFrameAccessor.getTupleCount(); i++) {
+
+ finalTupleBuilder.reset();
+
+ for (int k = 0; k < keyFields.length; k++) {
+ finalTupleBuilder.addField(outFrameAccessor, i, keyFields[k]);
+ }
+
+ if (isFinal) {
+
+ merger.outputFinalResult(finalTupleBuilder, outFrameAccessor, i, mergeState);
+
+ } else {
+
+ merger.outputPartialResult(finalTupleBuilder, outFrameAccessor, i, mergeState);
+ }
+
+ if (!writerAppender.appendSkipEmptyField(finalTupleBuilder.getFieldEndOffsets(),
+ finalTupleBuilder.getByteArray(), 0, finalTupleBuilder.getSize())) {
+ FrameUtils.flushFrame(writerFrame, writer);
+ writerAppender.reset(writerFrame, true);
+ if (!writerAppender.appendSkipEmptyField(finalTupleBuilder.getFieldEndOffsets(),
+ finalTupleBuilder.getByteArray(), 0, finalTupleBuilder.getSize())) {
+ throw new HyracksDataException("Aggregation output is too large to be fit into a frame.");
+ }
+ }
+ }
+ if (writerAppender.getTupleCount() > 0) {
+ FrameUtils.flushFrame(writerFrame, writer);
+ writerAppender.reset(writerFrame, true);
+ }
+
+ outAppender.reset(outFrame, true);
+ }
+
+ private void setNextTopTuple(int runIndex, int[] tupleIndices, RunFileReader[] runCursors,
+ FrameTupleAccessor[] tupleAccessors, ReferencedBucketBasedPriorityQueue topTuples)
+ throws HyracksDataException {
+ int runStart = runIndex * runFrameLimit;
+ boolean existNext = false;
+ if (tupleAccessors[currentFrameIndexInRun[runIndex]] == null || runCursors[runIndex] == null) {
+ /**
+ * run already closed
+ */
+ existNext = false;
+ } else if (currentFrameIndexInRun[runIndex] - runStart < currentRunFrames[runIndex] - 1) {
+ /**
+ * not the last frame for this run
+ */
+ existNext = true;
+ if (tupleIndices[runIndex] >= tupleAccessors[currentFrameIndexInRun[runIndex]].getTupleCount()) {
+ tupleIndices[runIndex] = 0;
+ currentFrameIndexInRun[runIndex]++;
+ }
+ } else if (tupleIndices[runIndex] < tupleAccessors[currentFrameIndexInRun[runIndex]].getTupleCount()) {
+ /**
+ * in the last loaded frame of this run, but it still has
+ * unconsumed tuples
+ */
+ existNext = true;
+ } else {
+ /**
+ * all tuples in the currently loaded frames of this run have
+ * been consumed, so load the next batch of frames
+ */
+ tupleIndices[runIndex] = 0;
+ currentFrameIndexInRun[runIndex] = runStart;
+ /**
+ * read in batch
+ */
+ currentRunFrames[runIndex] = 0;
+ for (int j = 0; j < runFrameLimit; j++) {
+ int frameIndex = currentFrameIndexInRun[runIndex] + j;
+ if (runCursors[runIndex].nextFrame(inFrames.get(frameIndex))) {
+ tupleAccessors[frameIndex].reset(inFrames.get(frameIndex));
+ existNext = true;
+ currentRunFrames[runIndex]++;
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (existNext) {
+ topTuples.popAndReplace(tupleAccessors[currentFrameIndexInRun[runIndex]], tupleIndices[runIndex]);
+ } else {
+ topTuples.pop();
+ closeRun(runIndex, runCursors, tupleAccessors);
+ }
+ }
+
+ /**
+ * Close the run file, and release its reader and the tuple accessors
+ * over its input frames.
+ *
+ * @param index
+ * @param runCursors
+ * @param tupleAccessor
+ * @throws HyracksDataException
+ */
+ private void closeRun(int index, RunFileReader[] runCursors, IFrameTupleAccessor[] tupleAccessor)
+ throws HyracksDataException {
+ if (runCursors[index] != null) {
+ runCursors[index].close();
+ runCursors[index] = null;
+ int frameOffset = index * runFrameLimit;
+ for (int j = 0; j < runFrameLimit; j++) {
+ tupleAccessor[frameOffset + j] = null;
+ }
+ }
+ }
+
+ private int compareFrameTuples(IFrameTupleAccessor fta1, int j1, IFrameTupleAccessor fta2, int j2) {
+ byte[] b1 = fta1.getBuffer().array();
+ byte[] b2 = fta2.getBuffer().array();
+ for (int f = 0; f < keyFields.length; ++f) {
+ int fIdx = f;
+ int s1 = fta1.getTupleStartOffset(j1) + fta1.getFieldSlotsLength() + fta1.getFieldStartOffset(j1, fIdx);
+ int l1 = fta1.getFieldLength(j1, fIdx);
+ int s2 = fta2.getTupleStartOffset(j2) + fta2.getFieldSlotsLength() + fta2.getFieldStartOffset(j2, fIdx);
+ int l2 = fta2.getFieldLength(j2, fIdx);
+ int c = comparators[f].compare(b1, s1, l1, b2, s2, l2);
+ if (c != 0) {
+ return c;
+ }
+ }
+ return 0;
+ }
+
+ private Comparator<ReferenceHashEntry> createEntryComparator(final IBinaryComparator[] comparators) {
+ return new Comparator<ReferenceHashEntry>() {
+
+ @Override
+ public int compare(ReferenceHashEntry o1, ReferenceHashEntry o2) {
+ int cmp = o1.getHashValue() - o2.getHashValue();
+ if (cmp != 0) {
+ return cmp;
+ } else {
+ FrameTupleAccessor fta1 = (FrameTupleAccessor) o1.getAccessor();
+ FrameTupleAccessor fta2 = (FrameTupleAccessor) o2.getAccessor();
+ int j1 = o1.getTupleIndex();
+ int j2 = o2.getTupleIndex();
+ byte[] b1 = fta1.getBuffer().array();
+ byte[] b2 = fta2.getBuffer().array();
+ for (int f = 0; f < keyFields.length; ++f) {
+ int fIdx = f;
+ int s1 = fta1.getTupleStartOffset(j1) + fta1.getFieldSlotsLength()
+ + fta1.getFieldStartOffset(j1, fIdx);
+ int l1 = fta1.getFieldEndOffset(j1, fIdx) - fta1.getFieldStartOffset(j1, fIdx);
+ int s2 = fta2.getTupleStartOffset(j2) + fta2.getFieldSlotsLength()
+ + fta2.getFieldStartOffset(j2, fIdx);
+ int l2 = fta2.getFieldEndOffset(j2, fIdx) - fta2.getFieldStartOffset(j2, fIdx);
+ int c = comparators[f].compare(b1, s1, l1, b2, s2, l2);
+ if (c != 0) {
+ return c;
+ }
+ }
+ return 0;
+ }
+ }
+
+ };
+ }
+}
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortRunMerger.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortRunMerger.java
new file mode 100644
index 0000000..a846e4a
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/HybridHashSortRunMerger.java
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import java.nio.ByteBuffer;
+import java.util.LinkedList;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameReader;
+import edu.uci.ics.hyracks.api.comm.IFrameWriter;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.io.FileReference;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileReader;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileWriter;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+
+public class HybridHashSortRunMerger {
+
+ private final IHyracksTaskContext ctx;
+ private final List<RunFileReader> runs;
+ private final int[] keyFields;
+ private final IBinaryComparator[] comparators;
+ private final RecordDescriptor recordDesc;
+ private final int framesLimit;
+ private final int tableSize;
+ private final IFrameWriter writer;
+ private final IAggregatorDescriptor grouper;
+ private final ITuplePartitionComputer tpc;
+ private ByteBuffer outFrame;
+ private FrameTupleAppender outFrameAppender;
+ private final boolean isLoadBuffered;
+
+ public HybridHashSortRunMerger(IHyracksTaskContext ctx, LinkedList<RunFileReader> runs, int[] keyFields,
+ IBinaryComparator[] comparators, RecordDescriptor recordDesc, ITuplePartitionComputer tpc,
+ IAggregatorDescriptor grouper, int framesLimit, int tableSize, IFrameWriter writer, boolean isLoadBuffered) {
+ this.ctx = ctx;
+ this.runs = runs;
+ this.keyFields = keyFields;
+ this.comparators = comparators;
+ this.recordDesc = recordDesc;
+ this.framesLimit = framesLimit;
+ this.writer = writer;
+ this.isLoadBuffered = isLoadBuffered;
+ this.tableSize = tableSize;
+ this.tpc = tpc;
+ this.grouper = grouper;
+ }
+
+ public void process() throws HyracksDataException {
+
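+ // Merge in multiple levels if necessary: while more runs remain than can
+ // be merged in a single pass (framesLimit - 1), merge groups of runs into
+ // new, longer runs; the final pass then merges directly into the output
+ // writer.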
+ // FIXME
+ int mergeLevels = 0, mergeRunCount = 0;
+ try {
+
+ outFrame = ctx.allocateFrame();
+ outFrameAppender = new FrameTupleAppender(ctx.getFrameSize());
+ outFrameAppender.reset(outFrame, true);
+
+ int maxMergeWidth = framesLimit - 1;
+ while (runs.size() > maxMergeWidth) {
+ int generationSeparator = 0;
+ // FIXME
+ int mergeRounds = 0;
+ while (generationSeparator < runs.size() && runs.size() > maxMergeWidth) {
+ int mergeWidth = Math.min(Math.min(runs.size() - generationSeparator, maxMergeWidth), runs.size()
+ - maxMergeWidth + 1);
+ FileReference newRun = ctx.createManagedWorkspaceFile(HybridHashSortRunMerger.class
+ .getSimpleName());
+ IFrameWriter mergeResultWriter = new RunFileWriter(newRun, ctx.getIOManager());
+ mergeResultWriter.open();
+ IFrameReader[] runCursors = new RunFileReader[mergeWidth];
+ for (int i = 0; i < mergeWidth; i++) {
+ runCursors[i] = runs.get(generationSeparator + i);
+ }
+ merge(mergeResultWriter, runCursors, false);
+ runs.subList(generationSeparator, generationSeparator + mergeWidth).clear();
+ runs.add(generationSeparator++, ((RunFileWriter) mergeResultWriter).createReader());
+ mergeRounds++;
+ }
+ mergeLevels++;
+ mergeRunCount += mergeRounds;
+ }
+ if (!runs.isEmpty()) {
+ IFrameReader[] runCursors = new RunFileReader[runs.size()];
+ for (int i = 0; i < runCursors.length; i++) {
+ runCursors[i] = runs.get(i);
+ }
+ merge(writer, runCursors, true);
+ }
+ } catch (Exception e) {
+ writer.fail();
+ throw new HyracksDataException(e);
+ } finally {
+
+ ctx.getCounterContext()
+ .getCounter("optional." + HybridHashSortRunMerger.class.getSimpleName() + ".merge.runs.count", true)
+ .set(mergeRunCount);
+
+ ctx.getCounterContext()
+ .getCounter("optional." + HybridHashSortRunMerger.class.getSimpleName() + ".merge.levels", true)
+ .set(mergeLevels);
+ }
+ }
+
+ private void merge(IFrameWriter mergeResultWriter, IFrameReader[] runCursors, boolean isFinal)
+ throws HyracksDataException {
+ // FIXME
+ long methodTimer = System.nanoTime();
+
+ IFrameReader merger = new GroupRunMergingFrameReader(ctx, runCursors, framesLimit, tableSize, keyFields, tpc,
+ comparators, grouper, recordDesc, isFinal, isLoadBuffered);
+ merger.open();
+ try {
+ while (merger.nextFrame(outFrame)) {
+ FrameUtils.flushFrame(outFrame, mergeResultWriter);
+ }
+ } finally {
+ merger.close();
+ }
+ ctx.getCounterContext()
+ .getCounter("optional." + HybridHashSortRunMerger.class.getSimpleName() + ".merge.time", true)
+ .update(System.nanoTime() - methodTimer);
+ }
+}
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferenceEntryWithBucketID.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferenceEntryWithBucketID.java
new file mode 100644
index 0000000..3c91fea
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferenceEntryWithBucketID.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.std.util.ReferenceEntry;
+
+public class ReferenceEntryWithBucketID extends ReferenceEntry {
+
+ private int bucketID;
+
+ public ReferenceEntryWithBucketID(int runid, FrameTupleAccessor fta, int tupleIndex, int bucketID) {
+ super(runid, fta, tupleIndex);
+ this.bucketID = bucketID;
+ }
+
+ public int getBucketID() {
+ return bucketID;
+ }
+
+ public void setBucketID(int bucketID) {
+ this.bucketID = bucketID;
+ }
+
+}
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferenceHashEntry.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferenceHashEntry.java
new file mode 100644
index 0000000..394f0a8
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferenceHashEntry.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.std.util.ReferenceEntry;
+
+public class ReferenceHashEntry extends ReferenceEntry {
+
+ private int hashValue;
+
+ public ReferenceHashEntry(int runid, FrameTupleAccessor fta, int tupleIndex, int hashVal) {
+ super(runid, fta, tupleIndex);
+ this.hashValue = hashVal;
+ }
+
+ public int getHashValue() {
+ return hashValue;
+ }
+
+ public void setHashValue(int hashVal) {
+ this.hashValue = hashVal;
+ }
+
+}
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferencedBucketBasedPriorityQueue.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferencedBucketBasedPriorityQueue.java
new file mode 100644
index 0000000..adfbe81
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferencedBucketBasedPriorityQueue.java
@@ -0,0 +1,156 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import java.util.BitSet;
+import java.util.Comparator;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.std.util.ReferenceEntry;
+
+public class ReferencedBucketBasedPriorityQueue {
+
+ private final int frameSize;
+ private final RecordDescriptor recordDescriptor;
+ private final ReferenceHashEntry entries[];
+ private final int size;
+ private final BitSet runAvail;
+ private int nItems;
+ private final int tableSize;
+
+ private final Comparator<ReferenceHashEntry> comparator;
+
+ private final ITuplePartitionComputer tpc;
+
+ public ReferencedBucketBasedPriorityQueue(int frameSize, RecordDescriptor recordDescriptor, int initSize,
+ Comparator<ReferenceHashEntry> comparator, ITuplePartitionComputer tpc, int tableSize) {
+ this.frameSize = frameSize;
+ this.recordDescriptor = recordDescriptor;
+ if (initSize < 1)
+ throw new IllegalArgumentException("initSize must be at least 1, but is " + initSize);
+ this.comparator = comparator;
+ nItems = initSize;
+ size = (initSize + 1) & 0xfffffffe;
+ entries = new ReferenceHashEntry[size];
+ runAvail = new BitSet(size);
+ runAvail.set(0, initSize, true);
+ for (int i = 0; i < size; i++) {
+ entries[i] = new ReferenceHashEntry(i, null, -1, -1);
+ }
+ this.tpc = tpc;
+ this.tableSize = tableSize;
+ }
+
+ /**
+ * Retrieve the top entry without removing it
+ *
+ * @return the top entry
+ */
+ public ReferenceEntry peek() {
+ return entries[0];
+ }
+
+ /**
+ * Replace the top entry with the given tuple from the same run, then sift
+ * it down to its position in the queue.
+ *
+ * @param fta
+ * @param tIndex
+ * @return the run id of the replaced entry
+ * @throws HyracksDataException
+ */
+ public int popAndReplace(FrameTupleAccessor fta, int tIndex) throws HyracksDataException {
+ ReferenceHashEntry entry = entries[0];
+ if (entry.getAccessor() == null) {
+ entry.setAccessor(new FrameTupleAccessor(frameSize, recordDescriptor));
+ }
+ entry.getAccessor().reset(fta.getBuffer());
+ entry.setTupleIndex(tIndex);
+ entry.setHashValue(tpc.partition(fta, tIndex, tableSize));
+
+ add(entry);
+ return entry.getRunid();
+ }
+
+ /**
+ * Push entry into priority queue
+ *
+ * @param e
+ * the new Entry
+ * @throws HyracksDataException
+ */
+ private void add(ReferenceHashEntry e) throws HyracksDataException {
+ ReferenceHashEntry min = entries[0];
+ int slot = (size >> 1) + (min.getRunid() >> 1);
+
+ ReferenceHashEntry curr = e;
+ while (!runAvail.isEmpty() && slot > 0) {
+ int c = 0;
+ if (!runAvail.get(entries[slot].getRunid())) {
+ // run of entries[slot] is exhausted, i.e. not available, curr
+ // wins
+ c = 1;
+ } else if (entries[slot].getAccessor() != null // entries[slot] is not the MIN value
+ && runAvail.get(curr.getRunid())) { // and curr's run is still available
+
+ if (curr.getAccessor() != null) {
+ c = comparator.compare(entries[slot], curr);
+ } else {
+ // curr is MIN value, wins
+ c = 1;
+ }
+ }
+
+ if (c <= 0) { // curr lost
+ // entries[slot] swaps up
+ ReferenceHashEntry tmp = entries[slot];
+ entries[slot] = curr;
+ curr = tmp;// winner to pass up
+ }// else curr wins
+ slot >>= 1;
+ }
+ // set new entries[0]
+ entries[0] = curr;
+ }
+
+ /**
+ * Pop the top entry; called only when its run is exhausted.
+ *
+ * @return the popped entry
+ * @throws HyracksDataException
+ */
+ public ReferenceHashEntry pop() throws HyracksDataException {
+ ReferenceHashEntry min = entries[0];
+ runAvail.clear(min.getRunid());
+ add(min);
+ nItems--;
+ return min;
+ }
+
+ public boolean areRunsExhausted() {
+ return runAvail.isEmpty();
+ }
+
+ public int size() {
+ return nItems;
+ }
+}
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferencedPriorityQueue.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferencedPriorityQueue.java
new file mode 100644
index 0000000..d9d5118
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hashsort/ReferencedPriorityQueue.java
@@ -0,0 +1,133 @@
+package edu.uci.ics.hyracks.dataflow.std.group.hashsort;
+
+import java.util.BitSet;
+import java.util.Comparator;
+
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+
+/**
+ * TODO need to be merged with the ReferencedPriorityQueue in the util package
+ */
+public class ReferencedPriorityQueue {
+ private final int frameSize;
+ private final RecordDescriptor recordDescriptor;
+ private final ReferenceEntryWithBucketID entries[];
+ private final int size;
+ private final BitSet runAvail;
+ private int nItems;
+
+ private final Comparator<ReferenceEntryWithBucketID> comparator;
+
+ public ReferencedPriorityQueue(int frameSize, RecordDescriptor recordDescriptor, int initSize,
+ Comparator<ReferenceEntryWithBucketID> comparator) {
+ this.frameSize = frameSize;
+ this.recordDescriptor = recordDescriptor;
+ if (initSize < 1)
+ throw new IllegalArgumentException();
+ this.comparator = comparator;
+ nItems = initSize;
+ size = (initSize + 1) & 0xfffffffe;
+ entries = new ReferenceEntryWithBucketID[size];
+ runAvail = new BitSet(size);
+ runAvail.set(0, initSize, true);
+ for (int i = 0; i < size; i++) {
+ entries[i] = new ReferenceEntryWithBucketID(i, null, -1, -1);
+ }
+ }
+
+ /**
+ * Retrieve the top entry without removing it
+ *
+ * @return the top entry
+ */
+ public ReferenceEntryWithBucketID peek() {
+ return entries[0];
+ }
+
+    /**
+     * Compare the new entry (given by its frame accessor, tuple index and
+     * bucket id) with the entries in the queue, to find the slot for this
+     * new entry.
+     *
+     * @param fta
+     * @param tIndex
+     * @param bucketID
+     * @return the run id of the entry replaced at the top
+     */
+ public int popAndReplace(FrameTupleAccessor fta, int tIndex, int bucketID) {
+ ReferenceEntryWithBucketID entry = entries[0];
+ if (entry.getAccessor() == null) {
+ entry.setAccessor(new FrameTupleAccessor(frameSize, recordDescriptor));
+ }
+ entry.getAccessor().reset(fta.getBuffer());
+ entry.setTupleIndex(tIndex);
+ entry.setBucketID(bucketID);
+
+ add(entry);
+ return entry.getRunid();
+ }
+
+ /**
+ * Push entry into priority queue
+ *
+ * @param e
+ * the new Entry
+ */
+ private void add(ReferenceEntryWithBucketID e) {
+ ReferenceEntryWithBucketID min = entries[0];
+ int slot = (size >> 1) + (min.getRunid() >> 1);
+
+ ReferenceEntryWithBucketID curr = e;
+ while (!runAvail.isEmpty() && slot > 0) {
+            int c = 0;
+            if (!runAvail.get(entries[slot].getRunid())) {
+                // the run of entries[slot] is exhausted, i.e. not available: curr wins
+                c = 1;
+            } else if (entries[slot].getAccessor() != null // entries[slot] is not the MIN sentinel
+                    && runAvail.get(curr.getRunid())) { // and the run of curr is still available
+                if (curr.getAccessor() != null) {
+                    c = comparator.compare(entries[slot], curr);
+                } else {
+                    // curr is the MIN sentinel, so it wins
+                    c = 1;
+                }
+            }
+
+            if (c <= 0) { // curr lost: entries[slot] swaps up
+                ReferenceEntryWithBucketID tmp = entries[slot];
+                entries[slot] = curr;
+                curr = tmp; // the winner is passed up
+            } // else curr wins and stays at this slot
+ slot >>= 1;
+ }
+ // set new entries[0]
+ entries[0] = curr;
+ }
+
+    /**
+     * Pop the top entry; this is called only when its run is exhausted.
+     *
+     * @return the popped top entry
+     */
+ public ReferenceEntryWithBucketID pop() {
+ ReferenceEntryWithBucketID min = entries[0];
+ runAvail.clear(min.getRunid());
+ add(min);
+ nItems--;
+ return min;
+ }
+
+ public boolean areRunsExhausted() {
+ return runAvail.isEmpty();
+ }
+
+ public int size() {
+ return nItems;
+ }
+}
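
For context, this queue is meant to be driven by a k-way merge loop: peek() exposes the smallest live entry, popAndReplace() substitutes the next tuple from the winning run, and pop() retires a run once it has no more tuples. A self-contained int-valued analogue of that protocol (plain Java, not Hyracks code; the loser tree is replaced by a linear scan to keep the sketch short):

    import java.util.BitSet;

    public class MergeProtocolSketch {
        static int[][] runs = { { 1, 4, 7 }, { 2, 5 }, { 3, 6, 8 } }; // sorted runs
        static int[] head = new int[runs.length]; // current head tuple of each run
        static int[] pos = new int[runs.length];  // next position within each run
        static BitSet runAvail = new BitSet();

        static int peekRun() { // index of the run holding the smallest head
            int best = -1;
            for (int r = 0; r < runs.length; r++) {
                if (runAvail.get(r) && (best < 0 || head[r] < head[best])) {
                    best = r;
                }
            }
            return best;
        }

        public static void main(String[] args) {
            runAvail.set(0, runs.length);
            for (int r = 0; r < runs.length; r++) {
                head[r] = runs[r][0];
                pos[r] = 1;
            }
            while (!runAvail.isEmpty()) {           // cf. areRunsExhausted()
                int r = peekRun();                  // cf. peek()
                System.out.print(head[r] + " ");    // emit the winner
                if (pos[r] < runs[r].length) {
                    head[r] = runs[r][pos[r]++];    // cf. popAndReplace()
                } else {
                    runAvail.clear(r);              // cf. pop(): run r is exhausted
                }
            }
            System.out.println(); // prints: 1 2 3 4 5 6 7 8
        }
    }
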
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/FrameTupleAccessorForGroupHashtable.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/FrameTupleAccessorForGroupHashtable.java
new file mode 100644
index 0000000..72bae76
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/FrameTupleAccessorForGroupHashtable.java
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hybridhash;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.comm.FrameHelper;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+
+public class FrameTupleAccessorForGroupHashtable implements IFrameTupleAccessor {
+ private final int frameSize;
+ private final RecordDescriptor recordDescriptor;
+
+ private final static int INT_SIZE = 4;
+
+ private ByteBuffer buffer;
+
+ public FrameTupleAccessorForGroupHashtable(int frameSize, RecordDescriptor recordDescriptor) {
+ this.frameSize = frameSize;
+ this.recordDescriptor = recordDescriptor;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getFieldCount()
+ */
+ @Override
+ public int getFieldCount() {
+ return recordDescriptor.getFieldCount();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getFieldSlotsLength()
+ */
+ @Override
+ public int getFieldSlotsLength() {
+ return getFieldCount() * 4;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getFieldEndOffset(int, int)
+ */
+ @Override
+ public int getFieldEndOffset(int tupleIndex, int fIdx) {
+ return buffer.getInt(getTupleStartOffset(tupleIndex) + fIdx * 4);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getFieldStartOffset(int, int)
+ */
+ @Override
+ public int getFieldStartOffset(int tupleIndex, int fIdx) {
+ return fIdx == 0 ? 0 : buffer.getInt(getTupleStartOffset(tupleIndex) + (fIdx - 1) * 4);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getFieldLength(int, int)
+ */
+ @Override
+ public int getFieldLength(int tupleIndex, int fIdx) {
+ return getFieldEndOffset(tupleIndex, fIdx) - getFieldStartOffset(tupleIndex, fIdx);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getTupleEndOffset(int)
+ */
+ @Override
+ public int getTupleEndOffset(int tupleIndex) {
+ return buffer.getInt(FrameHelper.getTupleCountOffset(frameSize) - 4 * (tupleIndex + 1)) - 2 * INT_SIZE;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getTupleStartOffset(int)
+ */
+ @Override
+ public int getTupleStartOffset(int tupleIndex) {
+ return tupleIndex == 0 ? 0 : buffer.getInt(FrameHelper.getTupleCountOffset(frameSize) - 4 * tupleIndex);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getTupleCount()
+ */
+ @Override
+ public int getTupleCount() {
+ return buffer.getInt(FrameHelper.getTupleCountOffset(frameSize));
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#getBuffer()
+ */
+ @Override
+ public ByteBuffer getBuffer() {
+ return buffer;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor#reset(java.nio.ByteBuffer)
+ */
+ @Override
+ public void reset(ByteBuffer buffer) {
+ this.buffer = buffer;
+ }
+
+ public int getTupleHashReferenceOffset(int tupleIndex) {
+ return getTupleEndOffset(tupleIndex);
+ }
+
+ public int getTupleEndOffsetWithHashReference(int tupleIndex) {
+ return buffer.getInt(FrameHelper.getTupleCountOffset(frameSize) - 4 * (tupleIndex + 1));
+ }
+
+ public int getHashReferenceNextFrameIndex(int tupleIndex) {
+ return buffer.getInt(getTupleHashReferenceOffset(tupleIndex));
+ }
+
+ public int getHashReferenceNextTupleIndex(int tupleIndex) {
+ return buffer.getInt(getTupleHashReferenceOffset(tupleIndex) + INT_SIZE);
+ }
+
+}
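
The getHashReference* accessors at the end expose the hidden 8-byte (frame index, tuple index) reference stored after every tuple's data; getTupleEndOffset subtracts 2 * INT_SIZE so that ordinary field access never sees it. A hedged sketch of walking one bucket chain with this accessor (contents, accessor, and the two header values are assumed to come from the surrounding hash table, as in HybridHashGroupHashTable below; -1 marks the end of a chain):

    int frameIndex = headerFrameIndexOfBucket; // assumed: read from the header page
    int tupleIndex = headerTupleIndexOfBucket; // assumed: read from the header page
    while (frameIndex >= 0) {
        accessor.reset(contents[frameIndex]);
        // ... inspect the tuple at tupleIndex through the ordinary accessors ...
        int nextFrame = accessor.getHashReferenceNextFrameIndex(tupleIndex);
        int nextTuple = accessor.getHashReferenceNextTupleIndex(tupleIndex);
        frameIndex = nextFrame;
        tupleIndex = nextTuple;
    }
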
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/FrameTupleAppenderForGroupHashtable.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/FrameTupleAppenderForGroupHashtable.java
new file mode 100644
index 0000000..c5668f5
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/FrameTupleAppenderForGroupHashtable.java
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hybridhash;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.comm.FrameHelper;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+
+public class FrameTupleAppenderForGroupHashtable {
+ private final int frameSize;
+
+ private ByteBuffer buffer;
+
+ private int tupleCount;
+
+ private int tupleDataEndOffset;
+
+ public FrameTupleAppenderForGroupHashtable(int frameSize) {
+ this.frameSize = frameSize;
+ }
+
+ public void reset(ByteBuffer buffer, boolean clear) {
+ this.buffer = buffer;
+ if (clear) {
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize), 0);
+ tupleCount = 0;
+ tupleDataEndOffset = 0;
+ } else {
+ tupleCount = buffer.getInt(FrameHelper.getTupleCountOffset(frameSize));
+ tupleDataEndOffset = tupleCount == 0 ? 0 : buffer.getInt(FrameHelper.getTupleCountOffset(frameSize)
+ - tupleCount * 4);
+ }
+ }
+
+ public boolean append(int[] fieldSlots, byte[] bytes, int offset, int length) {
+ if (tupleDataEndOffset + fieldSlots.length * 4 + length + 2 * 4 + 4 + (tupleCount + 1) * 4 <= frameSize) {
+ for (int i = 0; i < fieldSlots.length; ++i) {
+ buffer.putInt(tupleDataEndOffset + i * 4, fieldSlots[i]);
+ }
+ System.arraycopy(bytes, offset, buffer.array(), tupleDataEndOffset + fieldSlots.length * 4, length);
+ buffer.putInt(tupleDataEndOffset + fieldSlots.length * 4 + length, -1);
+ buffer.putInt(tupleDataEndOffset + fieldSlots.length * 4 + length + 4, -1);
+ tupleDataEndOffset += fieldSlots.length * 4 + length + 2 * 4;
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize) - 4 * (tupleCount + 1), tupleDataEndOffset);
+ ++tupleCount;
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize), tupleCount);
+ return true;
+ }
+ return false;
+ }
+
+ public boolean append(byte[] bytes, int offset, int length) {
+ if (tupleDataEndOffset + length + 2 * 4 + 4 + (tupleCount + 1) * 4 <= frameSize) {
+ System.arraycopy(bytes, offset, buffer.array(), tupleDataEndOffset, length);
+ buffer.putInt(tupleDataEndOffset + length, -1);
+ buffer.putInt(tupleDataEndOffset + length + 4, -1);
+ tupleDataEndOffset += length + 2 * 4;
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize) - 4 * (tupleCount + 1), tupleDataEndOffset);
+ ++tupleCount;
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize), tupleCount);
+ return true;
+ }
+ return false;
+ }
+
+ public boolean appendSkipEmptyField(int[] fieldSlots, byte[] bytes, int offset, int length) {
+ if (tupleDataEndOffset + fieldSlots.length * 4 + length + 2 * 4 + 4 + (tupleCount + 1) * 4 <= frameSize) {
+ int effectiveSlots = 0;
+ for (int i = 0; i < fieldSlots.length; ++i) {
+ if (fieldSlots[i] > 0) {
+ buffer.putInt(tupleDataEndOffset + i * 4, fieldSlots[i]);
+ effectiveSlots++;
+ }
+ }
+ System.arraycopy(bytes, offset, buffer.array(), tupleDataEndOffset + effectiveSlots * 4, length);
+ buffer.putInt(tupleDataEndOffset + effectiveSlots * 4 + length, -1);
+ buffer.putInt(tupleDataEndOffset + effectiveSlots * 4 + length + 4, -1);
+ tupleDataEndOffset += effectiveSlots * 4 + length + 2 * 4;
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize) - 4 * (tupleCount + 1), tupleDataEndOffset);
+ ++tupleCount;
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize), tupleCount);
+ return true;
+ }
+ return false;
+ }
+
+ public boolean append(IFrameTupleAccessor tupleAccessor, int tStartOffset, int tEndOffset) {
+ int length = tEndOffset - tStartOffset;
+ if (tupleDataEndOffset + length + 2 * 4 + 4 + (tupleCount + 1) * 4 <= frameSize) {
+ ByteBuffer src = tupleAccessor.getBuffer();
+ System.arraycopy(src.array(), tStartOffset, buffer.array(), tupleDataEndOffset, length);
+ buffer.putInt(tupleDataEndOffset + length, -1);
+ buffer.putInt(tupleDataEndOffset + length + 4, -1);
+ tupleDataEndOffset += length + 2 * 4;
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize) - 4 * (tupleCount + 1), tupleDataEndOffset);
+ ++tupleCount;
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize), tupleCount);
+ return true;
+ }
+ return false;
+ }
+
+ public boolean append(IFrameTupleAccessor tupleAccessor, int tIndex) {
+ int tStartOffset = tupleAccessor.getTupleStartOffset(tIndex);
+ int tEndOffset = tupleAccessor.getTupleEndOffset(tIndex);
+ return append(tupleAccessor, tStartOffset, tEndOffset);
+ }
+
+ public int getTupleCount() {
+ return tupleCount;
+ }
+
+ public ByteBuffer getBuffer() {
+ return buffer;
+ }
+}
+
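Every append variant above reserves the same two trailing ints (initialized to -1) that later hold the bucket's next-entry reference, and the capacity check accounts for the tuple body, that 8-byte reference, one new 4-byte end-offset slot, and the 4-byte tuple count. A self-contained mock of that layout for a single tuple (plain Java, assuming FrameHelper.getTupleCountOffset(frameSize) resolves to frameSize - 4, which is how the arithmetic here reads):

    import java.nio.ByteBuffer;

    public class GroupFrameLayoutDemo {
        public static void main(String[] args) {
            int frameSize = 64;
            ByteBuffer frame = ByteBuffer.allocate(frameSize);
            int tupleCountOffset = frameSize - 4; // assumed FrameHelper.getTupleCountOffset()
            int tupleDataEnd = 0;
            int tupleCount = 0;

            byte[] tuple = { 10, 20, 30 }; // an already-serialized tuple body
            // the capacity check mirrors append(byte[], int, int):
            // body + 8-byte reference + 4-byte count + one new 4-byte offset slot
            if (tupleDataEnd + tuple.length + 2 * 4 + 4 + (tupleCount + 1) * 4 <= frameSize) {
                System.arraycopy(tuple, 0, frame.array(), tupleDataEnd, tuple.length);
                frame.putInt(tupleDataEnd + tuple.length, -1);     // next frame index
                frame.putInt(tupleDataEnd + tuple.length + 4, -1); // next tuple index
                tupleDataEnd += tuple.length + 2 * 4;
                frame.putInt(tupleCountOffset - 4 * (tupleCount + 1), tupleDataEnd);
                tupleCount++;
                frame.putInt(tupleCountOffset, tupleCount);
            }

            // read back with the accessor's arithmetic: the visible tuple end
            // excludes the 8-byte reference (cf. getTupleEndOffset above)
            int endWithReference = frame.getInt(tupleCountOffset - 4 * tupleCount);
            System.out.println("tuple count = " + frame.getInt(tupleCountOffset)); // 1
            System.out.println("visible end = " + (endWithReference - 2 * 4));     // 3
        }
    }
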
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashGroupHashTable.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashGroupHashTable.java
new file mode 100644
index 0000000..b325b83
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashGroupHashTable.java
@@ -0,0 +1,609 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hybridhash;
+
+import java.nio.ByteBuffer;
+import java.util.LinkedList;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameReader;
+import edu.uci.ics.hyracks.api.comm.IFrameWriter;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputerFamily;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileWriter;
+import edu.uci.ics.hyracks.dataflow.std.group.AggregateState;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.structures.TuplePointer;
+
+public class HybridHashGroupHashTable implements IFrameWriter {
+
+ private final static int HEADER_REF_EMPTY = -1;
+
+ private static final int INT_SIZE = 4;
+
+ private IHyracksTaskContext ctx;
+
+ private final int frameSize;
+
+ private final int framesLimit;
+
+ private final int tableSize;
+
+ private final int numOfPartitions;
+
+ private final IFrameWriter outputWriter;
+
+ private final IBinaryComparator[] comparators;
+
+    /**
+     * key field indexes, in the input records and in the internal
+     * (aggregated) records respectively
+     */
+ private final int[] inputKeys, internalKeys;
+
+ private final RecordDescriptor inputRecordDescriptor, outputRecordDescriptor;
+
+ /**
+     * tuple partitioner used for addressing the in-memory hash table
+ */
+ private final ITuplePartitionComputer hashComputer;
+
+ /**
+ * hash partitioner for partitioning
+ */
+ private final ITuplePartitionComputer partitionComputer;
+
+    /**
+     * Hash table headers
+     */
+ private ByteBuffer[] headers;
+
+ /**
+ * buffers for hash table
+ */
+ private ByteBuffer[] contents;
+
+ /**
+ * output buffers for spilled partitions
+ */
+ private ByteBuffer[] spilledPartOutputBuffers;
+
+ /**
+ * run writers for spilled partitions
+ */
+ private RunFileWriter[] spilledPartRunWriters;
+
+ private int[] spilledPartRunSizeArrayInFrames;
+ private int[] spilledPartRunSizeArrayInTuples;
+
+ private List<IFrameReader> spilledPartRunReaders = null;
+ private List<Integer> spilledRunAggregatedPages = null;
+ private List<Integer> spilledPartRunSizesInFrames = null;
+ private List<Integer> spilledPartRunSizesInTuples = null;
+
+ /**
+ * index of the current working buffer in hash table
+ */
+ private int currentHashTableFrame;
+
+ /**
+ * Aggregation state
+ */
+ private AggregateState htAggregateState;
+
+ /**
+ * the aggregator
+ */
+ private final IAggregatorDescriptor aggregator;
+
+ /**
+ * records inserted into the in-memory hash table (for hashing and aggregation)
+ */
+ private int hashedRawRecords = 0;
+
+    /**
+     * number of unique keys inserted into the in-memory hash table
+     */
+ private int hashedKeys = 0;
+
+ /**
+ * Hash table tuple pointer
+ */
+ private TuplePointer matchPointer;
+
+ /**
+ * Frame tuple accessor for input data frames
+ */
+ private FrameTupleAccessor inputFrameTupleAccessor;
+
+ /**
+ * flag for whether the hash table if full
+ */
+ private boolean isHashtableFull;
+
+    /**
+     * flag for partition-only mode (no hashing or aggregation)
+     */
+ private boolean isPartitionOnly;
+
+ /**
+ * Tuple accessor for hash table contents
+ */
+ private FrameTupleAccessorForGroupHashtable hashtableRecordAccessor;
+
+ private ArrayTupleBuilder internalTupleBuilder;
+
+ private FrameTupleAppender spilledPartInsertAppender;
+
+ private FrameTupleAppenderForGroupHashtable htInsertAppender;
+
+ public HybridHashGroupHashTable(IHyracksTaskContext ctx, int framesLimit, int tableSize, int numOfPartitions,
+ int[] keys, int hashSeedOffset, IBinaryComparator[] comparators, ITuplePartitionComputerFamily tpcFamily,
+ IAggregatorDescriptor aggregator, RecordDescriptor inputRecordDescriptor,
+ RecordDescriptor outputRecordDescriptor, IFrameWriter outputWriter) throws HyracksDataException {
+ this.ctx = ctx;
+ this.frameSize = ctx.getFrameSize();
+ this.tableSize = tableSize;
+ this.framesLimit = framesLimit;
+ this.numOfPartitions = numOfPartitions;
+ this.inputKeys = keys;
+ this.internalKeys = new int[keys.length];
+ for (int i = 0; i < internalKeys.length; i++) {
+ internalKeys[i] = i;
+ }
+
+ this.comparators = comparators;
+
+ this.inputRecordDescriptor = inputRecordDescriptor;
+ this.outputRecordDescriptor = outputRecordDescriptor;
+
+ this.outputWriter = outputWriter;
+
+ this.hashComputer = tpcFamily.createPartitioner(hashSeedOffset * 2);
+ this.partitionComputer = tpcFamily.createPartitioner(hashSeedOffset * 2 + 1);
+
+ this.aggregator = aggregator;
+
+ }
+
+ public static double getHashtableOverheadRatio(int tableSize, int frameSize, int framesLimit, int recordSizeInByte) {
+ int pagesForRecord = framesLimit - getHeaderPages(tableSize, frameSize);
+ int recordsInHashtable = (pagesForRecord - 1) * ((int) (frameSize / (recordSizeInByte + 2 * INT_SIZE)));
+
+ return (double) framesLimit * frameSize / recordsInHashtable / recordSizeInByte;
+ }
+
+ public static int getHeaderPages(int tableSize, int frameSize) {
+        return (int) Math.ceil((double) tableSize * INT_SIZE * 2 / frameSize);
+ }
+
+ @Override
+ public void open() throws HyracksDataException {
+ // initialize hash headers
+ int htHeaderCount = getHeaderPages(tableSize, frameSize);
+
+ isPartitionOnly = false;
+ if (numOfPartitions >= framesLimit - htHeaderCount) {
+ isPartitionOnly = true;
+ }
+
+ if (isPartitionOnly) {
+ htHeaderCount = 0;
+ }
+
+ headers = new ByteBuffer[htHeaderCount];
+
+ // initialize hash table contents
+ contents = new ByteBuffer[framesLimit - htHeaderCount - numOfPartitions];
+ currentHashTableFrame = 0;
+ isHashtableFull = false;
+
+ // initialize hash table aggregate state
+ htAggregateState = aggregator.createAggregateStates();
+
+ // initialize partition information
+ spilledPartOutputBuffers = new ByteBuffer[numOfPartitions];
+ spilledPartRunWriters = new RunFileWriter[numOfPartitions];
+ spilledPartRunSizeArrayInFrames = new int[numOfPartitions];
+ spilledPartRunSizeArrayInTuples = new int[numOfPartitions];
+
+ // initialize other helper classes
+ inputFrameTupleAccessor = new FrameTupleAccessor(frameSize, inputRecordDescriptor);
+ internalTupleBuilder = new ArrayTupleBuilder(outputRecordDescriptor.getFieldCount());
+ spilledPartInsertAppender = new FrameTupleAppender(frameSize);
+
+ htInsertAppender = new FrameTupleAppenderForGroupHashtable(frameSize);
+ matchPointer = new TuplePointer();
+ hashtableRecordAccessor = new FrameTupleAccessorForGroupHashtable(frameSize, outputRecordDescriptor);
+ }
+
+ @Override
+ public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
+ inputFrameTupleAccessor.reset(buffer);
+ int tupleCount = inputFrameTupleAccessor.getTupleCount();
+ for (int i = 0; i < tupleCount; i++) {
+ insert(inputFrameTupleAccessor, i);
+ }
+ }
+
+ @Override
+ public void fail() throws HyracksDataException {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void close() throws HyracksDataException {
+ for (int i = 0; i < numOfPartitions; i++) {
+ if (spilledPartRunWriters[i] != null) {
+ spilledPartRunWriters[i].close();
+ }
+ }
+ htAggregateState.close();
+ }
+
+ private void insert(FrameTupleAccessor accessor, int tupleIndex) throws HyracksDataException {
+
+ if (isPartitionOnly) {
+ // for partition only
+ int pid = partitionComputer.partition(accessor, tupleIndex, tableSize) % numOfPartitions;
+ insertSpilledPartition(accessor, tupleIndex, pid);
+ spilledPartRunSizeArrayInTuples[pid]++;
+ return;
+ }
+
+ int hid = hashComputer.partition(accessor, tupleIndex, tableSize);
+
+ if (findMatch(hid, accessor, tupleIndex)) {
+            // found a match: do aggregation
+ hashtableRecordAccessor.reset(contents[matchPointer.frameIndex]);
+ aggregator.aggregate(accessor, tupleIndex, hashtableRecordAccessor, matchPointer.tupleIndex,
+ htAggregateState);
+ hashedRawRecords++;
+ } else {
+ if (isHashtableFull) {
+ // when hash table is full: spill the record
+ int pid = partitionComputer.partition(accessor, tupleIndex, tableSize) % numOfPartitions;
+ insertSpilledPartition(accessor, tupleIndex, pid);
+ spilledPartRunSizeArrayInTuples[pid]++;
+ } else {
+ // insert a new entry into the hash table
+ internalTupleBuilder.reset();
+ for (int k = 0; k < inputKeys.length; k++) {
+ internalTupleBuilder.addField(accessor, tupleIndex, inputKeys[k]);
+ }
+
+ aggregator.init(internalTupleBuilder, accessor, tupleIndex, htAggregateState);
+
+ if (contents[currentHashTableFrame] == null) {
+ contents[currentHashTableFrame] = ctx.allocateFrame();
+ }
+
+ htInsertAppender.reset(contents[currentHashTableFrame], false);
+ if (!htInsertAppender.append(internalTupleBuilder.getFieldEndOffsets(),
+ internalTupleBuilder.getByteArray(), 0, internalTupleBuilder.getSize())) {
+ // hash table is full: try to allocate more frame
+ currentHashTableFrame++;
+ if (currentHashTableFrame >= contents.length) {
+                        // no more frames to allocate: stop expanding the hash table
+ isHashtableFull = true;
+
+ // reinsert the record
+ insert(accessor, tupleIndex);
+
+ return;
+ } else {
+ if (contents[currentHashTableFrame] == null) {
+ contents[currentHashTableFrame] = ctx.allocateFrame();
+ }
+
+ htInsertAppender.reset(contents[currentHashTableFrame], true);
+
+ if (!htInsertAppender.append(internalTupleBuilder.getFieldEndOffsets(),
+ internalTupleBuilder.getByteArray(), 0, internalTupleBuilder.getSize())) {
+ throw new HyracksDataException(
+ "Failed to insert an aggregation partial result into the in-memory hash table: it has the length of "
+ + internalTupleBuilder.getSize() + " and fields "
+ + internalTupleBuilder.getFieldEndOffsets().length);
+ }
+
+ }
+ }
+
+ // update hash table reference
+ if (matchPointer.frameIndex < 0) {
+ // need to initialize the hash table header
+ int headerFrameIndex = getHeaderFrameIndex(hid);
+ int headerFrameOffset = getHeaderTupleIndex(hid);
+
+ if (headers[headerFrameIndex] == null) {
+ headers[headerFrameIndex] = ctx.allocateFrame();
+ resetHeader(headerFrameIndex);
+ }
+
+ headers[headerFrameIndex].putInt(headerFrameOffset, currentHashTableFrame);
+ headers[headerFrameIndex]
+ .putInt(headerFrameOffset + INT_SIZE, htInsertAppender.getTupleCount() - 1);
+ } else {
+ // update the previous reference
+ hashtableRecordAccessor.reset(contents[matchPointer.frameIndex]);
+ int refOffset = hashtableRecordAccessor.getTupleHashReferenceOffset(matchPointer.tupleIndex);
+ contents[matchPointer.frameIndex].putInt(refOffset, currentHashTableFrame);
+ contents[matchPointer.frameIndex]
+ .putInt(refOffset + INT_SIZE, htInsertAppender.getTupleCount() - 1);
+ }
+
+ hashedKeys++;
+ hashedRawRecords++;
+ }
+ }
+ }
+
+ /**
+ * Insert record into a spilled partition, by directly copying the tuple into the output buffer.
+ *
+ * @param accessor
+ * @param tupleIndex
+ * @param pid
+ */
+ private void insertSpilledPartition(FrameTupleAccessor accessor, int tupleIndex, int pid)
+ throws HyracksDataException {
+
+ if (spilledPartOutputBuffers[pid] == null) {
+ spilledPartOutputBuffers[pid] = ctx.allocateFrame();
+ }
+
+ spilledPartInsertAppender.reset(spilledPartOutputBuffers[pid], false);
+
+ if (!spilledPartInsertAppender.append(accessor, tupleIndex)) {
+ // the output buffer is full: flush
+ flushSpilledPartitionOutputBuffer(pid);
+ // reset the output buffer
+ spilledPartInsertAppender.reset(spilledPartOutputBuffers[pid], true);
+
+ if (!spilledPartInsertAppender.append(accessor, tupleIndex)) {
+ throw new HyracksDataException("Failed to insert a record into a spilled partition!");
+ }
+ }
+
+ }
+
+ /**
+ * Flush a spilled partition's output buffer.
+ *
+ * @param pid
+ * @throws HyracksDataException
+ */
+ private void flushSpilledPartitionOutputBuffer(int pid) throws HyracksDataException {
+ if (spilledPartRunWriters[pid] == null) {
+ spilledPartRunWriters[pid] = new RunFileWriter(
+ ctx.createManagedWorkspaceFile("HashHashPrePartitionHashTable"), ctx.getIOManager());
+ spilledPartRunWriters[pid].open();
+ }
+
+ FrameUtils.flushFrame(spilledPartOutputBuffers[pid], spilledPartRunWriters[pid]);
+
+ spilledPartRunSizeArrayInFrames[pid]++;
+ }
+
+    /**
+     * Look up the hash table for a matching entry.
+     *
+     * @param hid
+     * @param accessor
+     * @param tupleIndex
+     * @return true if a match is found; as a side effect, matchPointer refers
+     *         to the match, or to the last entry of the bucket otherwise
+     */
+ private boolean findMatch(int hid, FrameTupleAccessor accessor, int tupleIndex) {
+
+ matchPointer.frameIndex = -1;
+ matchPointer.tupleIndex = -1;
+
+ // get reference in the header
+ int headerFrameIndex = getHeaderFrameIndex(hid);
+ int headerFrameOffset = getHeaderTupleIndex(hid);
+
+ if (headers[headerFrameIndex] == null) {
+ return false;
+ }
+
+ // initialize the pointer to the first record
+ int entryFrameIndex = headers[headerFrameIndex].getInt(headerFrameOffset);
+ int entryTupleIndex = headers[headerFrameIndex].getInt(headerFrameOffset + INT_SIZE);
+
+ while (entryFrameIndex >= 0) {
+ matchPointer.frameIndex = entryFrameIndex;
+ matchPointer.tupleIndex = entryTupleIndex;
+ hashtableRecordAccessor.reset(contents[entryFrameIndex]);
+ if (compare(accessor, tupleIndex, hashtableRecordAccessor, entryTupleIndex) == 0) {
+ return true;
+ }
+ // Move to the next record in this entry following the linked list
+ int refOffset = hashtableRecordAccessor.getTupleHashReferenceOffset(entryTupleIndex);
+ int prevFrameIndex = entryFrameIndex;
+ entryFrameIndex = contents[prevFrameIndex].getInt(refOffset);
+ entryTupleIndex = contents[prevFrameIndex].getInt(refOffset + INT_SIZE);
+ }
+
+ return false;
+ }
+
+ public void finishup() throws HyracksDataException {
+ // spill all output buffers
+ for (int i = 0; i < numOfPartitions; i++) {
+ if (spilledPartOutputBuffers[i] != null) {
+ flushSpilledPartitionOutputBuffer(i);
+ }
+ }
+ spilledPartOutputBuffers = null;
+
+ // flush in-memory aggregation results: no more frame cost here as all output buffers are recycled
+ ByteBuffer outputBuffer = ctx.allocateFrame();
+ FrameTupleAppender outputBufferAppender = new FrameTupleAppender(frameSize);
+ outputBufferAppender.reset(outputBuffer, true);
+
+ ArrayTupleBuilder outFlushTupleBuilder = new ArrayTupleBuilder(outputRecordDescriptor.getFieldCount());
+
+ for (ByteBuffer htFrame : contents) {
+ if (htFrame == null) {
+ continue;
+ }
+ hashtableRecordAccessor.reset(htFrame);
+ int tupleCount = hashtableRecordAccessor.getTupleCount();
+ for (int i = 0; i < tupleCount; i++) {
+ outFlushTupleBuilder.reset();
+
+ for (int k = 0; k < internalKeys.length; k++) {
+ outFlushTupleBuilder.addField(hashtableRecordAccessor, i, internalKeys[k]);
+ }
+
+ aggregator.outputFinalResult(outFlushTupleBuilder, hashtableRecordAccessor, i, htAggregateState);
+
+ if (!outputBufferAppender.append(outFlushTupleBuilder.getFieldEndOffsets(),
+ outFlushTupleBuilder.getByteArray(), 0, outFlushTupleBuilder.getSize())) {
+ FrameUtils.flushFrame(outputBuffer, outputWriter);
+ outputBufferAppender.reset(outputBuffer, true);
+
+ if (!outputBufferAppender.append(outFlushTupleBuilder.getFieldEndOffsets(),
+ outFlushTupleBuilder.getByteArray(), 0, outFlushTupleBuilder.getSize())) {
+ throw new HyracksDataException(
+ "Failed to flush a record from in-memory hash table: record has length of "
+ + outFlushTupleBuilder.getSize() + " and fields "
+ + outFlushTupleBuilder.getFieldEndOffsets().length);
+ }
+ }
+ }
+ }
+
+ if (outputBufferAppender.getTupleCount() > 0) {
+ FrameUtils.flushFrame(outputBuffer, outputWriter);
+ }
+
+ // create run readers and statistic information for spilled runs
+ spilledPartRunReaders = new LinkedList<IFrameReader>();
+ spilledRunAggregatedPages = new LinkedList<Integer>();
+ spilledPartRunSizesInFrames = new LinkedList<Integer>();
+ spilledPartRunSizesInTuples = new LinkedList<Integer>();
+ for (int i = 0; i < numOfPartitions; i++) {
+ if (spilledPartRunWriters[i] != null) {
+ spilledPartRunReaders.add(spilledPartRunWriters[i].createReader());
+ spilledRunAggregatedPages.add(0);
+ spilledPartRunWriters[i].close();
+ spilledPartRunSizesInFrames.add(spilledPartRunSizeArrayInFrames[i]);
+ spilledPartRunSizesInTuples.add(spilledPartRunSizeArrayInTuples[i]);
+ }
+ }
+ }
+
+    /**
+     * Compare an input record with a hash table entry.
+     *
+     * @param accessor
+     * @param tupleIndex
+     * @param hashAccessor
+     * @param hashTupleIndex
+     * @return 0 if all key fields are equal, and a non-zero value otherwise
+     */
+ private int compare(FrameTupleAccessor accessor, int tupleIndex, FrameTupleAccessorForGroupHashtable hashAccessor,
+ int hashTupleIndex) {
+ int tStart0 = accessor.getTupleStartOffset(tupleIndex);
+ int fStartOffset0 = accessor.getFieldSlotsLength() + tStart0;
+
+ int tStart1 = hashAccessor.getTupleStartOffset(hashTupleIndex);
+ int fStartOffset1 = hashAccessor.getFieldSlotsLength() + tStart1;
+
+ for (int i = 0; i < internalKeys.length; ++i) {
+ int fStart0 = accessor.getFieldStartOffset(tupleIndex, inputKeys[i]);
+ int fEnd0 = accessor.getFieldEndOffset(tupleIndex, inputKeys[i]);
+ int fLen0 = fEnd0 - fStart0;
+
+ int fStart1 = hashAccessor.getFieldStartOffset(hashTupleIndex, internalKeys[i]);
+ int fEnd1 = hashAccessor.getFieldEndOffset(hashTupleIndex, internalKeys[i]);
+ int fLen1 = fEnd1 - fStart1;
+
+ int c = comparators[i].compare(accessor.getBuffer().array(), fStart0 + fStartOffset0, fLen0, hashAccessor
+ .getBuffer().array(), fStart1 + fStartOffset1, fLen1);
+ if (c != 0) {
+ return c;
+ }
+ }
+ return 0;
+ }
+
+    /**
+     * Get the index of the header frame containing the given hash table entry.
+     *
+     * @param entry
+     * @return the header frame index
+     */
+ private int getHeaderFrameIndex(int entry) {
+ int frameIndex = (entry / frameSize * 2 * INT_SIZE) + (entry % frameSize * 2 * INT_SIZE / frameSize);
+ return frameIndex;
+ }
+
+    /**
+     * Get the byte offset of the given hash table entry within its header frame
+     * (despite its name, this method returns a byte offset, not a tuple index).
+     *
+     * @param entry
+     * @return the byte offset within the header frame
+     */
+ private int getHeaderTupleIndex(int entry) {
+ int offset = (entry % frameSize) * 2 * INT_SIZE % frameSize;
+ return offset;
+ }
+
+    /**
+     * Reset the given header page by filling it with empty references.
+     *
+     * @param headerFrameIndex
+     */
+ private void resetHeader(int headerFrameIndex) {
+ for (int i = 0; i < frameSize; i += INT_SIZE) {
+ headers[headerFrameIndex].putInt(i, HEADER_REF_EMPTY);
+ }
+ }
+
+ public List<Integer> getSpilledRunsSizeInRawTuples() throws HyracksDataException {
+ return spilledPartRunSizesInTuples;
+ }
+
+ public int getHashedUniqueKeys() throws HyracksDataException {
+ return hashedKeys;
+ }
+
+ public int getHashedRawRecords() throws HyracksDataException {
+ return hashedRawRecords;
+ }
+
+ public List<Integer> getSpilledRunsAggregatedPages() throws HyracksDataException {
+ return spilledRunAggregatedPages;
+ }
+
+ public List<IFrameReader> getSpilledRuns() throws HyracksDataException {
+ return spilledPartRunReaders;
+ }
+
+ public List<Integer> getSpilledRunsSizeInPages() throws HyracksDataException {
+ return spilledPartRunSizesInFrames;
+ }
+
+}
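
The header addressing in getHeaderFrameIndex() / getHeaderTupleIndex() splits what is conceptually a flat byte address, entry * 2 * INT_SIZE (8 header bytes per entry for the frame/tuple reference), into a frame number and an in-frame offset. A standalone check of that equivalence (the frame size and entry range are arbitrary assumptions):

    public class HeaderAddressingDemo {
        public static void main(String[] args) {
            int frameSize = 32768; // an assumed frame size
            for (int entry = 0; entry < 1000000; entry++) {
                long byteAddress = (long) entry * 2 * 4; // 8 header bytes per entry
                int expectedFrame = (int) (byteAddress / frameSize);
                int expectedOffset = (int) (byteAddress % frameSize);
                // the two expressions below are copied from the methods above
                int frameIndex = (entry / frameSize * 2 * 4) + (entry % frameSize * 2 * 4 / frameSize);
                int offset = (entry % frameSize) * 2 * 4 % frameSize;
                if (frameIndex != expectedFrame || offset != expectedOffset) {
                    throw new AssertionError("mismatch at entry " + entry);
                }
            }
            System.out.println("header addressing equals entry * 8 split by frame for all tested entries");
        }
    }
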
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashGroupOperatorDescriptor.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashGroupOperatorDescriptor.java
new file mode 100644
index 0000000..118ca75
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashGroupOperatorDescriptor.java
@@ -0,0 +1,399 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hybridhash;
+
+import java.nio.ByteBuffer;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import edu.uci.ics.hyracks.api.comm.IFrameReader;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFamily;
+import edu.uci.ics.hyracks.api.dataflow.value.INormalizedKeyComputerFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
+import edu.uci.ics.hyracks.api.dataflow.value.ITuplePartitionComputerFamily;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.data.partition.FieldHashPartitionComputerFamily;
+import edu.uci.ics.hyracks.dataflow.common.io.RunFileReader;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
+import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptorFactory;
+import edu.uci.ics.hyracks.dataflow.std.group.hashsort.HybridHashSortGroupHashTable;
+import edu.uci.ics.hyracks.dataflow.std.group.hashsort.HybridHashSortRunMerger;
+
+public class HybridHashGroupOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
+
+ private static final long serialVersionUID = 1L;
+
+ private static final double HYBRID_FALLBACK_THRESHOLD = 0.8;
+
+    // magnifier applied to the estimated input size, to be on the safe side
+ private static final double ESTIMATOR_MAGNIFIER = 1.2;
+
+ // input key fields
+ private final int[] keyFields;
+
+ // intermediate and final key fields
+ private final int[] storedKeyFields;
+
+ /**
+     * Input size as the count of raw records.
+ */
+ private final long inputSizeInRawRecords;
+
+ /**
+ * Input size as the count of the unique keys.
+ */
+ private final long inputSizeInUniqueKeys;
+
+ // hash table size
+ private final int tableSize;
+
+    // estimated record size in bytes: used to compute the fudge factor
+ private final int userProvidedRecordSizeInBytes;
+
+ // aggregator
+ private final IAggregatorDescriptorFactory aggregatorFactory;
+
+    // merger, used when falling back to the hash-sort algorithm on hash skew
+ private final IAggregatorDescriptorFactory mergerFactory;
+
+ // for the sort fall-back algorithm
+ private final INormalizedKeyComputerFactory firstNormalizerFactory;
+
+ // total memory in pages
+ private final int framesLimit;
+
+ // comparator factories for key fields.
+ private final IBinaryComparatorFactory[] comparatorFactories;
+
+    /**
+     * hash function families for the key fields: a hash function family is
+     * needed, as more than one level of hashing may be used
+     */
+ private final IBinaryHashFunctionFamily[] hashFamilies;
+
+ /**
+ * Flag for input adjustment
+ */
+ private final boolean doInputAdjustment;
+
+ private final static double FUDGE_FACTOR_ESTIMATION = 1.2;
+
+ public HybridHashGroupOperatorDescriptor(JobSpecification spec, int[] keyFields, int framesLimit,
+ long inputSizeInRawRecords, long inputSizeInUniqueKeys, int recordSizeInBytes, int tableSize,
+ IBinaryComparatorFactory[] comparatorFactories, IBinaryHashFunctionFamily[] hashFamilies,
+ int hashFuncStartLevel, INormalizedKeyComputerFactory firstNormalizerFactory,
+ IAggregatorDescriptorFactory aggregatorFactory, IAggregatorDescriptorFactory mergerFactory,
+ RecordDescriptor outRecDesc) throws HyracksDataException {
+ this(spec, keyFields, framesLimit, inputSizeInRawRecords, inputSizeInUniqueKeys, recordSizeInBytes, tableSize,
+ comparatorFactories, hashFamilies, hashFuncStartLevel, firstNormalizerFactory, aggregatorFactory,
+ mergerFactory, outRecDesc, true);
+ }
+
+ public HybridHashGroupOperatorDescriptor(JobSpecification spec, int[] keyFields, int framesLimit,
+ long inputSizeInRawRecords, long inputSizeInUniqueKeys, int recordSizeInBytes, int tableSize,
+ IBinaryComparatorFactory[] comparatorFactories, IBinaryHashFunctionFamily[] hashFamilies,
+ int hashFuncStartLevel, INormalizedKeyComputerFactory firstNormalizerFactory,
+ IAggregatorDescriptorFactory aggregatorFactory, IAggregatorDescriptorFactory mergerFactory,
+ RecordDescriptor outRecDesc, boolean doInputAdjustment) throws HyracksDataException {
+ super(spec, 1, 1);
+ this.framesLimit = framesLimit;
+ this.tableSize = tableSize;
+ this.userProvidedRecordSizeInBytes = recordSizeInBytes;
+
+ this.inputSizeInRawRecords = inputSizeInRawRecords;
+ this.inputSizeInUniqueKeys = inputSizeInUniqueKeys;
+
+ if (framesLimit <= 3) {
+            // more than 3 frames are needed: 2 for the in-memory hash table, and 1 for the output buffer
+            throw new HyracksDataException(
+                    "Not enough memory for Hash-Hash Aggregation algorithm: more than 3 frames are necessary, but only "
+                            + framesLimit + " available.");
+ }
+
+ this.keyFields = keyFields;
+ storedKeyFields = new int[keyFields.length];
+ for (int i = 0; i < storedKeyFields.length; i++) {
+ storedKeyFields[i] = i;
+ }
+
+ this.aggregatorFactory = aggregatorFactory;
+
+ this.mergerFactory = mergerFactory;
+ this.firstNormalizerFactory = firstNormalizerFactory;
+
+ this.comparatorFactories = comparatorFactories;
+
+ this.hashFamilies = hashFamilies;
+
+ recordDescriptors[0] = outRecDesc;
+
+ this.doInputAdjustment = doInputAdjustment;
+ }
+
+ @Override
+ public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
+ IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) throws HyracksDataException {
+
+ final IBinaryComparator[] comparators = new IBinaryComparator[comparatorFactories.length];
+ for (int i = 0; i < comparators.length; i++) {
+ comparators[i] = comparatorFactories[i].createBinaryComparator();
+ }
+
+ final RecordDescriptor inRecDesc = recordDescProvider.getInputRecordDescriptor(getActivityId(), 0);
+
+ final int frameSize = ctx.getFrameSize();
+
+ final double fudgeFactor = HybridHashGroupHashTable.getHashtableOverheadRatio(tableSize, frameSize,
+ framesLimit, userProvidedRecordSizeInBytes) * FUDGE_FACTOR_ESTIMATION;
+
+ return new AbstractUnaryInputUnaryOutputOperatorNodePushable() {
+
+ HybridHashGroupHashTable topProcessor;
+
+ int observedInputSizeInRawTuples;
+ int observedInputSizeInFrames, maxRecursiveLevels;
+
+ int userProvidedInputSizeInFrames;
+
+ boolean topLevelFallbackCheck = true;
+
+ ITuplePartitionComputerFamily tpcf = new FieldHashPartitionComputerFamily(keyFields, hashFamilies);
+
+ ITuplePartitionComputerFamily tpcfMerge = new FieldHashPartitionComputerFamily(storedKeyFields,
+ hashFamilies);
+
+ ByteBuffer readAheadBuf;
+
+            /**
+             * Compute the number of partitions using the hybrid-hash formula.
+             *
+             * @param tableSize
+             * @param framesLimit
+             * @param inputKeySize
+             * @param factor
+             * @return the number of partitions
+             */
+ private int getNumberOfPartitions(int tableSize, int framesLimit, long inputKeySize, double factor) {
+
+ int hashtableHeaderPages = HybridHashGroupHashTable.getHeaderPages(tableSize, frameSize);
+
+ int numberOfPartitions = HybridHashUtil.hybridHashPartitionComputer((int) Math.ceil(inputKeySize),
+ framesLimit, factor);
+
+                // if the number of partitions exceeds the frames available for hash table contents, do pure partitioning.
+ if (numberOfPartitions >= framesLimit - hashtableHeaderPages) {
+ numberOfPartitions = framesLimit;
+ }
+
+ if (numberOfPartitions <= 0) {
+ numberOfPartitions = 1;
+ }
+
+ return numberOfPartitions;
+ }
+
+ @Override
+ public void open() throws HyracksDataException {
+
+ observedInputSizeInFrames = 0;
+
+ // estimate the number of unique keys for this partition, given the total raw record count and unique record count
+ long estimatedNumberOfUniqueKeys = HybridHashUtil.getEstimatedPartitionSizeOfUniqueKeys(
+ inputSizeInRawRecords, inputSizeInUniqueKeys, 1);
+
+ userProvidedInputSizeInFrames = (int) Math.ceil(estimatedNumberOfUniqueKeys
+ * userProvidedRecordSizeInBytes / frameSize);
+
+ int topPartitions = getNumberOfPartitions(tableSize, framesLimit,
+ (int) Math.ceil(userProvidedInputSizeInFrames * ESTIMATOR_MAGNIFIER), fudgeFactor);
+
+ topProcessor = new HybridHashGroupHashTable(ctx, framesLimit, tableSize, topPartitions, keyFields, 0,
+ comparators, tpcf, aggregatorFactory.createAggregator(ctx, inRecDesc, recordDescriptors[0],
+ keyFields, storedKeyFields), inRecDesc, recordDescriptors[0], writer);
+
+ writer.open();
+ topProcessor.open();
+ }
+
+ @Override
+ public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
+ observedInputSizeInRawTuples += buffer.getInt(buffer.capacity() - 4);
+ observedInputSizeInFrames++;
+ topProcessor.nextFrame(buffer);
+ }
+
+ @Override
+ public void fail() throws HyracksDataException {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void close() throws HyracksDataException {
+                // estimate the maximum number of recursion levels
+ maxRecursiveLevels = (int) Math.max(
+ Math.ceil(Math.log(observedInputSizeInFrames * fudgeFactor) / Math.log(framesLimit)) + 1, 1);
+
+ finishAndRecursion(topProcessor, observedInputSizeInRawTuples, inputSizeInUniqueKeys, 0,
+ topLevelFallbackCheck);
+
+ writer.close();
+
+ }
+
+ private void processRunFiles(IFrameReader runReader, int inputCardinality, int runLevel)
+ throws HyracksDataException {
+
+ boolean checkFallback = true;
+
+                int numOfPartitions = getNumberOfPartitions(tableSize, framesLimit, (long) inputCardinality
+                        * userProvidedRecordSizeInBytes / frameSize, fudgeFactor);
+
+ HybridHashGroupHashTable processor = new HybridHashGroupHashTable(ctx, framesLimit, tableSize,
+ numOfPartitions, keyFields, runLevel, comparators, tpcf, aggregatorFactory.createAggregator(
+ ctx, inRecDesc, recordDescriptors[0], keyFields, storedKeyFields), inRecDesc,
+ recordDescriptors[0], writer);
+
+ processor.open();
+
+ runReader.open();
+
+ int inputRunRawSizeInTuples = 0;
+
+ if (readAheadBuf == null) {
+ readAheadBuf = ctx.allocateFrame();
+ }
+ while (runReader.nextFrame(readAheadBuf)) {
+ inputRunRawSizeInTuples += readAheadBuf.getInt(readAheadBuf.capacity() - 4);
+ processor.nextFrame(readAheadBuf);
+ }
+
+ runReader.close();
+
+ finishAndRecursion(processor, inputRunRawSizeInTuples, inputCardinality, runLevel, checkFallback);
+ }
+
+ /**
+ * Finish the hash table processing and start recursive processing on run files.
+ *
+ * @param ht
+ * @param inputRawRecordCount
+ * @param inputCardinality
+ * @param level
+ * @param checkFallback
+ * @throws HyracksDataException
+ */
+ private void finishAndRecursion(HybridHashGroupHashTable ht, long inputRawRecordCount,
+ long inputCardinality, int level, boolean checkFallback) throws HyracksDataException {
+
+ ht.finishup();
+
+ List<IFrameReader> generatedRunReaders = ht.getSpilledRuns();
+ List<Integer> partitionRawRecords = ht.getSpilledRunsSizeInRawTuples();
+
+ int directFlushKeysInTuples = ht.getHashedUniqueKeys();
+ int directFlushRawRecordsInTuples = ht.getHashedRawRecords();
+
+ ht.close();
+ ht = null;
+
+ ctx.getCounterContext().getCounter("optional.levels." + level + ".estiInputKeyCardinality", true)
+ .update(inputCardinality);
+
+                // adjust the cardinality estimate, using the observed ratio of unique keys
+                // to raw records among the tuples aggregated in memory
+ if (doInputAdjustment && directFlushRawRecordsInTuples > 0) {
+ inputCardinality = (int) Math.ceil((double) directFlushKeysInTuples / directFlushRawRecordsInTuples
+ * inputRawRecordCount);
+ }
+
+ ctx.getCounterContext()
+ .getCounter("optional.levels." + level + ".estiInputKeyCardinalityAdjusted", true)
+ .update(inputCardinality);
+
+ IFrameReader recurRunReader;
+ int subPartitionRawRecords;
+
+ while (!generatedRunReaders.isEmpty()) {
+ recurRunReader = generatedRunReaders.remove(0);
+ subPartitionRawRecords = partitionRawRecords.remove(0);
+
+ int runKeyCardinality = (int) Math.ceil((double) inputCardinality * subPartitionRawRecords
+ / inputRawRecordCount);
+
+ if ((checkFallback && runKeyCardinality > HYBRID_FALLBACK_THRESHOLD * inputCardinality)
+ || level > maxRecursiveLevels) {
+ Logger.getLogger(HybridHashGroupOperatorDescriptor.class.getSimpleName()).warning(
+ "Hybrid-hash falls back to hash-sort algorithm! (" + level + ":" + maxRecursiveLevels
+ + ")");
+ fallback(recurRunReader, level);
+ } else {
+ processRunFiles(recurRunReader, runKeyCardinality, level + 1);
+ }
+
+ }
+ }
+
+ private void fallback(IFrameReader recurRunReader, int runLevel) throws HyracksDataException {
+ // fall back
+ FrameTupleAccessor runFrameTupleAccessor = new FrameTupleAccessor(frameSize, inRecDesc);
+ HybridHashSortGroupHashTable hhsTable = new HybridHashSortGroupHashTable(ctx, framesLimit, tableSize,
+ keyFields, comparators, tpcf.createPartitioner(runLevel + 1),
+ firstNormalizerFactory.createNormalizedKeyComputer(), aggregatorFactory.createAggregator(ctx,
+ inRecDesc, recordDescriptors[0], keyFields, storedKeyFields), inRecDesc,
+ recordDescriptors[0]);
+
+ recurRunReader.open();
+ if (readAheadBuf == null) {
+ readAheadBuf = ctx.allocateFrame();
+ }
+ while (recurRunReader.nextFrame(readAheadBuf)) {
+ runFrameTupleAccessor.reset(readAheadBuf);
+ int tupleCount = runFrameTupleAccessor.getTupleCount();
+ for (int j = 0; j < tupleCount; j++) {
+ hhsTable.insert(runFrameTupleAccessor, j);
+ }
+ }
+
+ recurRunReader.close();
+ hhsTable.finishup();
+
+ LinkedList<RunFileReader> hhsRuns = hhsTable.getRunFileReaders();
+
+ if (hhsRuns.isEmpty()) {
+ hhsTable.flushHashtableToOutput(writer);
+ hhsTable.close();
+ } else {
+ hhsTable.close();
+ HybridHashSortRunMerger hhsMerger = new HybridHashSortRunMerger(ctx, hhsRuns, storedKeyFields,
+ comparators, recordDescriptors[0], tpcfMerge.createPartitioner(runLevel + 1),
+ mergerFactory.createAggregator(ctx, recordDescriptors[0], recordDescriptors[0],
+ storedKeyFields, storedKeyFields), framesLimit, tableSize, writer, false);
+ hhsMerger.process();
+ }
+ }
+
+ };
+ }
+}
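
The recursion control in finishAndRecursion() can be restated compactly: a spilled run falls back to the hash-sort path when its estimated key cardinality has not shrunk enough relative to its parent, or when the recursion runs too deep. A standalone sketch of just that predicate, using the constants from this file (the method name is illustrative):

    // cf. the condition guarding fallback(...) in finishAndRecursion()
    static boolean shouldFallBack(int runKeyCardinality, long parentCardinality, int level,
            int maxRecursiveLevels, boolean checkFallback) {
        final double HYBRID_FALLBACK_THRESHOLD = 0.8; // as declared in this class
        return (checkFallback && runKeyCardinality > HYBRID_FALLBACK_THRESHOLD * parentCardinality)
                || level > maxRecursiveLevels;
    }
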
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashUtil.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashUtil.java
new file mode 100644
index 0000000..5323887
--- /dev/null
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/hybridhash/HybridHashUtil.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.dataflow.std.group.hybridhash;
+
+public class HybridHashUtil {
+
+    /**
+     * Compute the expected number of spilling partitions (the in-memory partition is not included), using the
+     * hybrid-hash formula from [Shapiro86]. Note that 0 means that no spilling partition is needed.
+     *
+     * @param inputSizeOfUniqueKeysInFrames
+     * @param memorySizeInFrames
+     * @param fudgeFactor
+     * @return the number of spilling partitions
+     */
+ public static int hybridHashPartitionComputer(int inputSizeOfUniqueKeysInFrames, int memorySizeInFrames,
+ double fudgeFactor) {
+ return Math.max(
+ (int) Math.ceil((inputSizeOfUniqueKeysInFrames * fudgeFactor - memorySizeInFrames)
+ / (memorySizeInFrames - 1)), 0);
+ }
+
+    /**
+     * Compute the estimated number of unique keys in a partition of the dataset, using Yao's formula.
+     *
+     * @param inputSizeInRawRecords
+     * @param inputSizeInUniqueKeys
+     * @param numOfPartitions
+     * @return the estimated number of unique keys per partition
+     */
+ public static long getEstimatedPartitionSizeOfUniqueKeys(long inputSizeInRawRecords, long inputSizeInUniqueKeys,
+ int numOfPartitions) {
+ if (numOfPartitions == 1) {
+ return inputSizeInUniqueKeys;
+ }
+ return (long) Math.ceil(inputSizeInUniqueKeys
+ * (1 - Math.pow(1 - ((double) inputSizeInRawRecords / (double) numOfPartitions)
+ / (double) inputSizeInRawRecords, (double) inputSizeInRawRecords
+ / (double) inputSizeInUniqueKeys)));
+ }
+}
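
A worked example of the two estimators may help: with 1000 frames of unique keys, 100 frames of memory and a fudge factor of 1.2, the Shapiro formula asks for ceil((1000 * 1.2 - 100) / (100 - 1)) = 12 spilling partitions; and splitting a 1,000,000-record, 100,000-key input four ways, Yao's formula estimates 100000 * (1 - (1 - 1/4)^10), about 94369 unique keys per partition. The same numbers as a runnable snippet (the inputs are arbitrary assumptions):

    public class HybridHashUtilDemo {
        public static void main(String[] args) {
            int spillingParts = HybridHashUtil.hybridHashPartitionComputer(1000, 100, 1.2);
            System.out.println("spilling partitions = " + spillingParts); // 12

            long keysPerPart = HybridHashUtil.getEstimatedPartitionSizeOfUniqueKeys(1000000L, 100000L, 4);
            System.out.println("unique keys per partition = " + keysPerPart); // 94369
        }
    }
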
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/preclustered/PreclusteredGroupWriter.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/preclustered/PreclusteredGroupWriter.java
index d1fec29..fd4f8da 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/preclustered/PreclusteredGroupWriter.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/group/preclustered/PreclusteredGroupWriter.java
@@ -29,20 +29,26 @@
import edu.uci.ics.hyracks.dataflow.std.group.IAggregatorDescriptor;
public class PreclusteredGroupWriter implements IFrameWriter {
+
+ private final static int INT_SIZE = 4;
+
private final int[] groupFields;
private final IBinaryComparator[] comparators;
private final IAggregatorDescriptor aggregator;
private final AggregateState aggregateState;
private final IFrameWriter writer;
- private final ByteBuffer copyFrame;
private final FrameTupleAccessor inFrameAccessor;
- private final FrameTupleAccessor copyFrameAccessor;
private final ByteBuffer outFrame;
private final FrameTupleAppender appender;
private final ArrayTupleBuilder tupleBuilder;
- private boolean first;
+ private final RecordDescriptor outRecordDesc;
+
+ private byte[] groupResultCache;
+ private ByteBuffer groupResultCacheBuffer;
+ private FrameTupleAccessor groupResultCacheAccessor;
+ private FrameTupleAppender groupResultCacheAppender;
public PreclusteredGroupWriter(IHyracksTaskContext ctx, int[] groupFields, IBinaryComparator[] comparators,
IAggregatorDescriptor aggregator, RecordDescriptor inRecordDesc, RecordDescriptor outRecordDesc,
@@ -52,10 +58,9 @@
this.aggregator = aggregator;
this.aggregateState = aggregator.createAggregateStates();
this.writer = writer;
- copyFrame = ctx.allocateFrame();
+ this.outRecordDesc = outRecordDesc;
+
inFrameAccessor = new FrameTupleAccessor(ctx.getFrameSize(), inRecordDesc);
- copyFrameAccessor = new FrameTupleAccessor(ctx.getFrameSize(), inRecordDesc);
- copyFrameAccessor.reset(copyFrame);
outFrame = ctx.allocateFrame();
appender = new FrameTupleAppender(ctx.getFrameSize());
@@ -67,7 +72,6 @@
@Override
public void open() throws HyracksDataException {
writer.open();
- first = true;
}
@Override
@@ -75,40 +79,45 @@
inFrameAccessor.reset(buffer);
int nTuples = inFrameAccessor.getTupleCount();
for (int i = 0; i < nTuples; ++i) {
- if (first) {
- tupleBuilder.reset();
- for (int j = 0; j < groupFields.length; j++) {
- tupleBuilder.addField(inFrameAccessor, i, groupFields[j]);
- }
- aggregator.init(tupleBuilder, inFrameAccessor, i, aggregateState);
-
- first = false;
-
- } else {
- if (i == 0) {
- switchGroupIfRequired(copyFrameAccessor, copyFrameAccessor.getTupleCount() - 1, inFrameAccessor, i);
+ if (groupResultCache != null && groupResultCacheAccessor.getTupleCount() > 0) {
+ groupResultCacheAccessor.reset(ByteBuffer.wrap(groupResultCache));
+ if (sameGroup(inFrameAccessor, i, groupResultCacheAccessor, 0)) {
+                    // found a match: do aggregation
+ aggregator.aggregate(inFrameAccessor, i, groupResultCacheAccessor, 0, aggregateState);
+ continue;
} else {
- switchGroupIfRequired(inFrameAccessor, i - 1, inFrameAccessor, i);
+ // write the cached group into the final output
+ writeOutput(groupResultCacheAccessor, 0);
}
-
}
- }
- FrameUtils.copy(buffer, copyFrame);
- }
-
- private void switchGroupIfRequired(FrameTupleAccessor prevTupleAccessor, int prevTupleIndex,
- FrameTupleAccessor currTupleAccessor, int currTupleIndex) throws HyracksDataException {
- if (!sameGroup(prevTupleAccessor, prevTupleIndex, currTupleAccessor, currTupleIndex)) {
- writeOutput(prevTupleAccessor, prevTupleIndex);
tupleBuilder.reset();
+
for (int j = 0; j < groupFields.length; j++) {
- tupleBuilder.addField(currTupleAccessor, currTupleIndex, groupFields[j]);
+ tupleBuilder.addField(inFrameAccessor, i, groupFields[j]);
}
- aggregator.init(tupleBuilder, currTupleAccessor, currTupleIndex, aggregateState);
- } else {
- aggregator.aggregate(currTupleAccessor, currTupleIndex, null, 0, aggregateState);
+
+ aggregator.init(tupleBuilder, inFrameAccessor, i, aggregateState);
+
+ // enlarge the cache buffer if necessary
+ int requiredSize = tupleBuilder.getSize() + tupleBuilder.getFieldEndOffsets().length * INT_SIZE + 2
+ * INT_SIZE;
+
+ if (groupResultCache == null || groupResultCache.length < requiredSize) {
+ groupResultCache = new byte[requiredSize];
+ groupResultCacheAppender = new FrameTupleAppender(groupResultCache.length);
+ groupResultCacheBuffer = ByteBuffer.wrap(groupResultCache);
+ groupResultCacheAccessor = new FrameTupleAccessor(groupResultCache.length, outRecordDesc);
+ }
+
+ groupResultCacheAppender.reset(groupResultCacheBuffer, true);
+ if (!groupResultCacheAppender.append(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray(), 0,
+ tupleBuilder.getSize())) {
+ throw new HyracksDataException("The partial result is too large to be initialized in a frame.");
+ }
+
+ groupResultCacheAccessor.reset(groupResultCacheBuffer);
}
}
@@ -117,7 +126,7 @@
tupleBuilder.reset();
for (int j = 0; j < groupFields.length; j++) {
- tupleBuilder.addField(lastTupleAccessor, lastTupleIndex, groupFields[j]);
+ tupleBuilder.addField(lastTupleAccessor, lastTupleIndex, j);
}
aggregator.outputFinalResult(tupleBuilder, lastTupleAccessor, lastTupleIndex, aggregateState);
@@ -138,8 +147,8 @@
int fIdx = groupFields[i];
int s1 = a1.getTupleStartOffset(t1Idx) + a1.getFieldSlotsLength() + a1.getFieldStartOffset(t1Idx, fIdx);
int l1 = a1.getFieldLength(t1Idx, fIdx);
- int s2 = a2.getTupleStartOffset(t2Idx) + a2.getFieldSlotsLength() + a2.getFieldStartOffset(t2Idx, fIdx);
- int l2 = a2.getFieldLength(t2Idx, fIdx);
+ int s2 = a2.getTupleStartOffset(t2Idx) + a2.getFieldSlotsLength() + a2.getFieldStartOffset(t2Idx, i);
+ int l2 = a2.getFieldLength(t2Idx, i);
if (comparators[i].compare(a1.getBuffer().array(), s1, l1, a2.getBuffer().array(), s2, l2) != 0) {
return false;
}
@@ -154,8 +163,8 @@
@Override
public void close() throws HyracksDataException {
- if (!first) {
- writeOutput(copyFrameAccessor, copyFrameAccessor.getTupleCount() - 1);
+ if (groupResultCache != null && groupResultCacheAccessor.getTupleCount() > 0) {
+ writeOutput(groupResultCacheAccessor, 0);
if (appender.getTupleCount() > 0) {
FrameUtils.flushFrame(outFrame, writer);
}
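The rewrite above drops the previous-tuple comparison (copyFrame plus switchGroupIfRequired) in favor of a single cached group: the running group's partial aggregate is held in groupResultCache, each matching input tuple is aggregated into it in place, and the cached tuple is flushed to the output only when the group key changes or the writer closes. A minimal, self-contained sketch of the same pattern, assuming input pre-sorted on the group key; the class and method names are illustrative, not Hyracks APIs:

    import java.util.ArrayList;
    import java.util.List;

    // Illustrative sketch of the cached-group pattern, not Hyracks code.
    public class PreClusteredGroupSketch {
        // sortedRows: each row is { key, value }, pre-sorted on key.
        public static List<String> sumByKey(List<String[]> sortedRows) {
            List<String> out = new ArrayList<String>();
            String cachedKey = null; // plays the role of groupResultCache
            long cachedSum = 0;      // the cached partial aggregate
            for (String[] row : sortedRows) {
                String key = row[0];
                long value = Long.parseLong(row[1]);
                if (cachedKey != null && cachedKey.equals(key)) {
                    cachedSum += value;                       // sameGroup(): aggregate in place
                } else {
                    if (cachedKey != null) {
                        out.add(cachedKey + "," + cachedSum); // group changed: flush the cache
                    }
                    cachedKey = key;                          // init(): start a new cached group
                    cachedSum = value;
                }
            }
            if (cachedKey != null) {
                out.add(cachedKey + "," + cachedSum);         // close(): flush the last group
            }
            return out;
        }
    }

The same single-slot cache explains the close() change above: the last group only leaves the cache when the stream ends, so close() must flush it.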
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/misc/PrinterOperatorDescriptor.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/misc/PrinterOperatorDescriptor.java
index 47a8616..0af2dd8 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/misc/PrinterOperatorDescriptor.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/misc/PrinterOperatorDescriptor.java
@@ -48,10 +48,10 @@
@Override
public void writeData(Object[] data) throws HyracksDataException {
for (int i = 0; i < data.length; ++i) {
- System.err.print(StringSerializationUtils.toString(data[i]));
- System.err.print(", ");
+ // System.err.print(StringSerializationUtils.toString(data[i]));
+ // System.err.print(", ");
}
- System.err.println();
+ // System.err.println();
}
@Override
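The prints above are commented out rather than removed. A hedged alternative, not what this patch does: guard the debug output behind a java.util.logging level check so it can be re-enabled without editing source. The class name and the writeData shape below are illustrative:

    import java.util.logging.Level;
    import java.util.logging.Logger;

    // Illustrative sketch: level-guarded debug output instead of commented-out prints.
    public class DebugPrintSketch {
        private static final Logger LOGGER = Logger.getLogger(DebugPrintSketch.class.getName());

        public void writeData(Object[] data) {
            if (LOGGER.isLoggable(Level.FINE)) { // skipped entirely at the default level
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < data.length; ++i) {
                    sb.append(String.valueOf(data[i])).append(", ");
                }
                LOGGER.fine(sb.toString());
            }
        }
    }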
diff --git a/hyracks/hyracks-examples/hadoop-compat-example/hadoopcompatserver/pom.xml b/hyracks/hyracks-examples/hadoop-compat-example/hadoopcompatserver/pom.xml
index 9a0f7cb..e545663 100644
--- a/hyracks/hyracks-examples/hadoop-compat-example/hadoopcompatserver/pom.xml
+++ b/hyracks/hyracks-examples/hadoop-compat-example/hadoopcompatserver/pom.xml
@@ -111,8 +111,9 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>2.0.2</version>
<configuration>
- <source>1.6</source>
- <target>1.6</target>
+ <source>1.7</source>
+ <target>1.7</target>
+ <fork>true</fork>
</configuration>
</plugin>
<plugin>
diff --git a/hyracks/hyracks-examples/text-example/textserver/pom.xml b/hyracks/hyracks-examples/text-example/textserver/pom.xml
index 1daca96..0ee6e0a 100644
--- a/hyracks/hyracks-examples/text-example/textserver/pom.xml
+++ b/hyracks/hyracks-examples/text-example/textserver/pom.xml
@@ -110,8 +110,9 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>2.0.2</version>
<configuration>
- <source>1.6</source>
- <target>1.6</target>
+ <source>1.7</source>
+ <target>1.7</target>
+ <fork>true</fork>
</configuration>
</plugin>
<plugin>
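Both example-server poms move the compiler from 1.6 to 1.7 (with a forked compiler process). For context, constructs like the ones below require -source 1.7 and fail to compile under the old setting; the snippet is illustrative only:

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    // Illustrative only: each construct below needs -source 1.7.
    public class Java7Features {
        public static void main(String[] args) throws IOException {
            List<String> lines = new ArrayList<>(); // diamond operator
            try (BufferedReader r = new BufferedReader(new FileReader(args[0]))) { // try-with-resources
                String line;
                while ((line = r.readLine()) != null) {
                    lines.add(line);
                }
            }
            switch (args.length > 1 ? args[1] : "count") { // strings in switch
                case "count":
                    System.out.println(lines.size());
                    break;
                default:
                    System.out.println(lines);
            }
        }
    }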
diff --git a/hyracks/hyracks-hdfs/hyracks-hdfs-core/pom.xml b/hyracks/hyracks-hdfs/hyracks-hdfs-core/pom.xml
index 87875f9..bec0d34 100644
--- a/hyracks/hyracks-hdfs/hyracks-hdfs-core/pom.xml
+++ b/hyracks/hyracks-hdfs/hyracks-hdfs-core/pom.xml
@@ -1,5 +1,6 @@
<?xml version="1.0"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>hyracks-hdfs-core</artifactId>
<name>hyracks-hdfs-core</name>
@@ -52,6 +53,18 @@
</filesets>
</configuration>
</plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <version>2.2</version>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
</plugins>
</build>
diff --git a/pom.xml b/pom.xml
index 3a7d87b..e63a751 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,4 +1,4 @@
-
+<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>edu.uci.ics.hyracks</groupId>
@@ -68,5 +68,6 @@
<module>algebricks</module>
<module>pregelix</module>
<module>hivesterix</module>
+ <module>genomix</module>
</modules>
</project>
diff --git a/pregelix/pregelix-core/src/main/java/edu/uci/ics/pregelix/core/util/PregelixHyracksIntegrationUtil.java b/pregelix/pregelix-core/src/main/java/edu/uci/ics/pregelix/core/util/PregelixHyracksIntegrationUtil.java
index d099645..b761f62 100644
--- a/pregelix/pregelix-core/src/main/java/edu/uci/ics/pregelix/core/util/PregelixHyracksIntegrationUtil.java
+++ b/pregelix/pregelix-core/src/main/java/edu/uci/ics/pregelix/core/util/PregelixHyracksIntegrationUtil.java
@@ -45,7 +45,7 @@
private static NodeControllerService nc2;
private static IHyracksClientConnection hcc;
- public static void init() throws Exception {
+ public static void init(String topologyFilePath) throws Exception {
CCConfig ccConfig = new CCConfig();
ccConfig.clientNetIpAddress = CC_HOST;
ccConfig.clusterNetIpAddress = CC_HOST;
diff --git a/pregelix/pregelix-core/src/test/java/edu/uci/ics/pregelix/core/join/JoinTest.java b/pregelix/pregelix-core/src/test/java/edu/uci/ics/pregelix/core/join/JoinTest.java
index 3c00cad..a343cae 100644
--- a/pregelix/pregelix-core/src/test/java/edu/uci/ics/pregelix/core/join/JoinTest.java
+++ b/pregelix/pregelix-core/src/test/java/edu/uci/ics/pregelix/core/join/JoinTest.java
@@ -102,7 +102,7 @@
ClusterConfig.setStorePath(PATH_TO_CLUSTER_STORE);
ClusterConfig.setClusterPropertiesPath(PATH_TO_CLUSTER_PROPERTIES);
cleanupStores();
- PregelixHyracksIntegrationUtil.init();
+ PregelixHyracksIntegrationUtil.init("src/test/resources/topology.xml");
FileUtils.forceMkdir(new File(EXPECT_RESULT_DIR));
FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
diff --git a/pregelix/pregelix-example/src/test/java/edu/uci/ics/pregelix/example/dataload/DataLoadTest.java b/pregelix/pregelix-example/src/test/java/edu/uci/ics/pregelix/example/dataload/DataLoadTest.java
index da6d564..d6d9fe3 100644
--- a/pregelix/pregelix-example/src/test/java/edu/uci/ics/pregelix/example/dataload/DataLoadTest.java
+++ b/pregelix/pregelix-example/src/test/java/edu/uci/ics/pregelix/example/dataload/DataLoadTest.java
@@ -76,7 +76,7 @@
ClusterConfig.setStorePath(PATH_TO_CLUSTER_STORE);
ClusterConfig.setClusterPropertiesPath(PATH_TO_CLUSTER_PROPERTIES);
cleanupStores();
- PregelixHyracksIntegrationUtil.init();
+ PregelixHyracksIntegrationUtil.init("src/test/resources/topology.xml");
LOGGER.info("Hyracks mini-cluster started");
startHDFS();
FileUtils.forceMkdir(new File(EXPECT_RESULT_DIR));
diff --git a/pregelix/pregelix-example/src/test/java/edu/uci/ics/pregelix/example/jobrun/RunJobTestSuite.java b/pregelix/pregelix-example/src/test/java/edu/uci/ics/pregelix/example/jobrun/RunJobTestSuite.java
index 0a444fa..14d7718 100644
--- a/pregelix/pregelix-example/src/test/java/edu/uci/ics/pregelix/example/jobrun/RunJobTestSuite.java
+++ b/pregelix/pregelix-example/src/test/java/edu/uci/ics/pregelix/example/jobrun/RunJobTestSuite.java
@@ -77,7 +77,7 @@
ClusterConfig.setStorePath(PATH_TO_CLUSTER_STORE);
ClusterConfig.setClusterPropertiesPath(PATH_TO_CLUSTER_PROPERTIES);
cleanupStores();
- PregelixHyracksIntegrationUtil.init();
+ PregelixHyracksIntegrationUtil.init("src/test/resources/topology.xml");
LOGGER.info("Hyracks mini-cluster started");
FileUtils.forceMkdir(new File(ACTUAL_RESULT_DIR));
FileUtils.cleanDirectory(new File(ACTUAL_RESULT_DIR));
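All three test setups now hand their bundled src/test/resources/topology.xml to PregelixHyracksIntegrationUtil.init(String). The hunk for init() shows only the signature change, so how the path is consumed is not visible here; the sketch below is an assumption (a topology file validated and handed to the cluster-controller config), with illustrative names:

    import java.io.File;

    // Hedged sketch: the wiring below is assumed, not shown by the diff.
    public class TopologyInitSketch {
        public static void init(String topologyFilePath) throws Exception {
            File topology = new File(topologyFilePath);
            if (!topology.exists()) {
                throw new IllegalArgumentException("topology file not found: " + topologyFilePath);
            }
            // presumably something like: ccConfig.clusterTopologyDefinition = topology; (assumed field name)
            System.out.println("would start CC/NCs with topology " + topology.getAbsolutePath());
        }

        public static void main(String[] args) throws Exception {
            init("src/test/resources/topology.xml"); // matches the updated call sites above
        }
    }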
diff --git a/pregelix/pregelix-runtime/src/main/java/edu/uci/ics/pregelix/runtime/function/ComputeUpdateFunctionFactory.java b/pregelix/pregelix-runtime/src/main/java/edu/uci/ics/pregelix/runtime/function/ComputeUpdateFunctionFactory.java
index 0a0a14f..b6df213 100644
--- a/pregelix/pregelix-runtime/src/main/java/edu/uci/ics/pregelix/runtime/function/ComputeUpdateFunctionFactory.java
+++ b/pregelix/pregelix-runtime/src/main/java/edu/uci/ics/pregelix/runtime/function/ComputeUpdateFunctionFactory.java
@@ -183,6 +183,7 @@
}
try {
+ vertex.activate();
vertex.compute(msgIterator);
vertex.finishCompute();
} catch (IOException e) {