[NO ISSUE] Avoid overwriting files in SparkSQL perf test
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
1. Overwriting a file on HDFS sometimes causes an uploading issue. Now we
generate different scripts for cc and ncs.
2. Fix query issues in SparkSQL test.
3. Add commented disk option.
Change-Id: Ia0e04b3c80cc83322def2a949fc0ddf01fd8e7a8
Reviewed-on: https://asterix-gerrit.ics.uci.edu/2011
Sonar-Qube: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Contrib: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Till Westmann <tillw@apache.org>
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml
index 4529e8b..9fc4c9a 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml
@@ -50,6 +50,12 @@
Name: "{{ spark_instance_name }}"
aws_access_key: "{{ access_key_id }}"
aws_secret_key: "{{ secret_access_key }}"
+ # Enable the volume section if you need extra disk space
+ #volumes:
+ # - device_name: /dev/xvda
+ # volume_type: gp2
+ # volume_size: 50
+ # delete_on_termination: true
register:
ec2
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml
index 267ab2a..155515f 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml
@@ -23,7 +23,7 @@
- include_vars: spark_sql_settings.yml
- name: Download Hadoop
get_url:
- url: http://apache.mirrors.hoobly.com/hadoop/common/hadoop-2.8.0/hadoop-2.8.0.tar.gz
+ url: https://archive.apache.org/dist/hadoop/core/hadoop-2.8.0/hadoop-2.8.0.tar.gz
dest: "{{ home_dir }}/hadoop.tar.gz"
- name: Unzip Hadoop
unarchive:
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml
index 5b07eed..651d166 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml
@@ -24,10 +24,24 @@
- name: Create TPCH data dir on HDFS
shell: 'bash {{ hadoop_base }}/bin/hdfs dfs -mkdir -p {{ data_dir }}'
-- hosts: ncs
+- hosts: ncs[0]
tasks:
- include_vars: spark_sql_settings.yml
- include_vars: "{{ playbook_dir }}/../conf/private_ip.yml"
- include_vars: ../../../benchmarks/tpch/gen/settings.yml
- - name: Put data on to HDFS
- shell: for i in `ls -rS {{ data_dir }}/`; do {{ hadoop_base }}/bin/hdfs dfs -put -f {{ data_dir }}/$i hdfs://{{ cc_ip }}:9000/{{ data_dir }}; rm {{ data_dir }}/$i; done
\ No newline at end of file
+ - name: Populate data uploading script for 1st node
+ shell: echo "for i in \`ls -rS {{ data_dir }}/*.tbl*\`; do {{ hadoop_base }}/bin/hdfs dfs -put -f \$i hdfs://{{ cc_ip }}:9000/{{ data_dir }}; done" > "{{ upload_script }}"
+
+
+- hosts: ncs[1:]
+ tasks:
+ - include_vars: spark_sql_settings.yml
+ - include_vars: "{{ playbook_dir }}/../conf/private_ip.yml"
+ - include_vars: ../../../benchmarks/tpch/gen/settings.yml
+  - name: Populate data uploading script for the rest of the nodes
+ shell: echo "for i in \`ls -rS {{ data_dir }}/*.tbl.*\`; do {{ hadoop_base }}/bin/hdfs dfs -put -f \$i hdfs://{{ cc_ip }}:9000/{{ data_dir }}; done" > "{{ upload_script }}"
+
+- hosts: ncs
+ tasks:
+ - name: Upload data
+ shell: sh {{ upload_script }}
\ No newline at end of file
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml
index 1ff3f67..da13d3d 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml
@@ -42,3 +42,5 @@
binary_dir: "{{ home_dir }}/{{ generator }}"
local_result: "/tmp/sparkSQL_Result.txt"
+
+upload_script: "{{ home_dir }}/upload.sh"
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql
index e45fea1..f40cd2c 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql
@@ -20,7 +20,7 @@
WITH Q20_TMP1 AS
(SELECT DISTINCT P_PARTKEY
FROM PART
- WHERE P_NAME LIKE "FOREST%"),
+ WHERE P_NAME LIKE "forest%"),
Q20_TMP2 AS
(SELECT L_PARTKEY,
L_SUPPKEY,
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql
index db73e62..49cb859 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql
@@ -56,7 +56,7 @@
FROM NATION N
JOIN SUPPLIER S ON N.N_NATIONKEY = S.S_NATIONKEY) S1
JOIN LINEITEM L ON S1.S_SUPPKEY = L.L_SUPPKEY) L1 ON PS.PS_SUPPKEY = L1.L_SUPPKEY
- AND PS.S_PARTKEY = L1.L_PARTKEY) L2 ON P.P_NAME LIKE "%GREEN%"
+ AND PS.S_PARTKEY = L1.L_PARTKEY) L2 ON P.P_NAME LIKE "%green%"
AND P.P_PARTKEY = L2.L_PARTKEY) L3 ON O.O_ORDERKEY = L3.L_ORDERKEY) PROFIT
GROUP BY NATION,
O_YEAR
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl
index cf5bc36..4762135 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl
@@ -72,6 +72,7 @@
spark.sqlContext.cacheTable("LINEITEM")
// Execute Query
+val writer0 = new PrintWriter(new File("{{home_dir}}/detail.txt"))
val queries_dir = new File(queries_root)
val etime = collection.mutable.Map[String, Float]()
for (i <- 0 to round) {
@@ -82,7 +83,7 @@
val t0 = System.nanoTime()
var query = ""
queries.getLines.foreach { line => query += (line + "\n")}
- spark.sql(query).count()
+ spark.sql(query).collect().foreach(println)
val t1 = System.nanoTime()
val elapsed = (t1 - t0) / 1000000000.0f
if (i > 0) {
@@ -91,8 +92,12 @@
}
etime(file_name) += elapsed
}
+ writer0.print(file_name + " " + elapsed + " ")
+ writer0.flush()
}
+ writer0.print("\n")
}
+writer0.close()
// Write result
val writer = new PrintWriter(new File("{{ result_file }}"))
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql
index 4590598..c90dd7c 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql
@@ -23,7 +23,7 @@
FROM (
SELECT C.C_CUSTKEY, COUNT(O.O_ORDERKEY) AS O_ORDERKEY_COUNT
FROM (CUSTOMER C LEFT OUTER JOIN ORDERS O)
- WHERE C.C_CUSTKEY = O.O_CUSTKEY AND O.O_COMMENT NOT LIKE "%SPECIAL%REQUESTS%"
+ WHERE C.C_CUSTKEY = O.O_CUSTKEY AND O.O_COMMENT NOT LIKE "%special%requests%"
GROUP BY C.C_CUSTKEY
) CO
GROUP BY C_CUSTKEY
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql
index aea9188..b0686a4 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql
@@ -30,11 +30,11 @@
FROM PARTSUPP PS,
PART P
WHERE P.P_PARTKEY = PS.S_PARTKEY
- AND P.P_BRAND != "BRAND#45"
+ AND P.P_BRAND != "Brand#45"
AND P.P_TYPE NOT LIKE "MEDIUM POLISHED%") AS PSP,
SUPPLIER S
WHERE PSP.PS_SUPPKEY = S.S_SUPPKEY
- AND S.S_COMMENT NOT LIKE "%CUSTOMER%COMPLAINTS%")
+ AND S.S_COMMENT NOT LIKE "%Customer%Complaints%")
SELECT P_BRAND,
P_TYPE,
P_SIZE,
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql
index c3894fe..b51e530 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql
@@ -22,12 +22,12 @@
0.2 * AVG(L_QUANTITY) T_AVG_QUANTITY
FROM LINEITEM
GROUP BY L_PARTKEY)
-SELECT *
+SELECT SUM(L.L_EXTENDEDPRICE) / 7.0
FROM tmp T,
LINEITEM L,
PART P
WHERE P.P_PARTKEY = L.L_PARTKEY
AND P.P_CONTAINER = "MED BOX"
- AND P.P_BRAND = "BRAND#23"
+ AND P.P_BRAND = "Brand#23"
AND L.L_PARTKEY = T.T_PARTKEY
AND L.L_QUANTITY < T.T_AVG_QUANTITY
\ No newline at end of file
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql
index 5cacbdb..01f8692 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql
@@ -29,19 +29,19 @@
SELECT SUM(L.EXTNDPRICE * (1 - L.DISCOUNT))
FROM tmp L
JOIN PART P ON P.P_PARTKEY = L.LPKEY
-WHERE (P.P_BRAND = "BRAND#12"
+WHERE (P.P_BRAND = "Brand#12"
AND P.P_CONTAINER REGEXP "SM CASE|SM BOX|SM PACK|SM PKG"
AND L.QUANTITY >= 1
AND L.QUANTITY <= 11
AND P.P_SIZE >= 1
AND P.P_SIZE <= 5)
- OR (P.P_BRAND = "BRAND#23"
+ OR (P.P_BRAND = "Brand#23"
AND P.P_CONTAINER REGEXP "MED BAG|MED BOX|MED PKG|MED PACK"
AND L.QUANTITY >= 10
AND L.QUANTITY <= 20
AND P.P_SIZE >= 1
AND P.P_SIZE <= 10)
- OR (P.P_BRAND = "BRAND#34"
+ OR (P.P_BRAND = "Brand#34"
AND P.P_CONTAINER REGEXP "LG CASE|LG BOX|LG PACK|LG PKG"
AND L.QUANTITY >= 20
AND L.QUANTITY <= 30
diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql
index aadbb55..a932192 100644
--- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql
+++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql
@@ -17,19 +17,18 @@
-- under the License.
-- ------------------------------------------------------------
-SELECT l.L_RETURNFLAG,
- l.L_LINESTATUS,
- sum(l.L_QUANTITY) AS sum_qty,
- sum(l.L_EXTENDEDPRICE) AS sum_base_price,
- sum(l.L_EXTENDEDPRICE * (1 - l.L_DISCOUNT)) AS sum_disc_price,
- sum(l.L_EXTENDEDPRICE * (1 - l.L_DISCOUNT) * (1 + l.L_TAX)) AS sum_charge,
- avg(l.l_quantity) AS ave_qty,
- avg(l.L_EXTENDEDPRICE) AS ave_price,
- avg(l.L_DISCOUNT) AS ave_disc,
- count(*) AS count_order
-FROM LINEITEM AS l
-WHERE l.L_SHIPDATE <= "1998-09-02"
-GROUP BY l.L_RETURNFLAG,
- l.L_LINESTATUS
-ORDER BY l.L_RETURNFLAG,
- l.L_LINESTATUS
\ No newline at end of file
+SELECT l.L_ORDERKEY,
+ sum(l.L_EXTENDEDPRICE * (1 - l.L_DISCOUNT)) AS REVENUE,
+ o.O_ORDERDATE,
+ o.O_SHIPPRIORITY
+FROM CUSTOMER AS c,
+ ORDERS AS o,
+ LINEITEM AS l
+where c.C_MKTSEGMENT = 'BUILDING'
+ AND c.C_CUSTKEY = o.O_CUSTKEY
+ AND l.L_ORDERKEY = o.O_ORDERKEY
+ AND o.O_ORDERDATE < '1995-03-15'
+ AND l.L_SHIPDATE > '1995-03-15'
+GROUP BY l.L_ORDERKEY, o.O_ORDERDATE, o.O_SHIPPRIORITY
+ORDER BY REVENUE DESC,O_ORDERDATE
+LIMIT 10
\ No newline at end of file