From 29153f32a3e5070d0014f1b7350b1819a8c1bd91 Mon Sep 17 00:00:00 2001 From: Joseph Allemandou Date: Thu, 30 May 2024 18:50:14 +0200 Subject: [PATCH] Revert analytics webrequest hive jar change The new version generates an NPE due to maxmind refactoring. Reopening T365197 Bug:T365197 --- .../webrequest/refine_webrequest_frontend_hourly_dag_factory.py | 2 +- .../dags/webrequest/refine_webrequest_hourly_dag_factory.py | 2 +- ...e_webrequest_frontend_hourly_text-refine_webrequest.expected | 2 +- ...webrequest_frontend_hourly_upload-refine_webrequest.expected | 2 +- ....py-refine_webrequest_hourly_text-refine_webrequest.expected | 2 +- ...y-refine_webrequest_hourly_upload-refine_webrequest.expected | 2 +- ...efine_webrequest_hourly_test_text-refine_webrequest.expected | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/analytics/dags/webrequest/refine_webrequest_frontend_hourly_dag_factory.py b/analytics/dags/webrequest/refine_webrequest_frontend_hourly_dag_factory.py index 791b73c45..a0ff5b15b 100644 --- a/analytics/dags/webrequest/refine_webrequest_frontend_hourly_dag_factory.py +++ b/analytics/dags/webrequest/refine_webrequest_frontend_hourly_dag_factory.py @@ -292,7 +292,7 @@ def generate_dag( task_id="refine_webrequest", sql="https://gitlab.wikimedia.org/-/snippets/124/raw/main/refine_webrequest_hourly.hql", # noqa: W505, E501 query_parameters={ - "refinery_jar": artifact("refinery-hive-0.2.41-shaded.jar"), + "refinery_jar": artifact("refinery-hive-0.2.30-shaded.jar"), "source_table": raw_webrequest_table, "webrequest_source": webrequest_source, "destination_table": refined_webrequest_table, diff --git a/analytics/dags/webrequest/refine_webrequest_hourly_dag_factory.py b/analytics/dags/webrequest/refine_webrequest_hourly_dag_factory.py index 1ca3cd580..2de02af35 100644 --- a/analytics/dags/webrequest/refine_webrequest_hourly_dag_factory.py +++ b/analytics/dags/webrequest/refine_webrequest_hourly_dag_factory.py @@ -334,7 +334,7 @@ def generate_dag( task_id="refine_webrequest", sql=f"{hql_directory}/webrequest/refine_webrequest_hourly.hql", query_parameters={ - "refinery_jar": artifact("refinery-hive-0.2.41-shaded.jar"), + "refinery_jar": artifact("refinery-hive-0.2.30-shaded.jar"), "source_table": raw_webrequest_table, "webrequest_source": webrequest_source, "destination_table": refined_webrequest_table, diff --git a/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_frontend_hourly_dag.py-refine_webrequest_frontend_hourly_text-refine_webrequest.expected b/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_frontend_hourly_dag.py-refine_webrequest_frontend_hourly_text-refine_webrequest.expected index 1661684fe..55c53b2d8 100644 --- a/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_frontend_hourly_dag.py-refine_webrequest_frontend_hourly_text-refine_webrequest.expected +++ b/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_frontend_hourly_dag.py-refine_webrequest_frontend_hourly_text-refine_webrequest.expected @@ -31,7 +31,7 @@ master: --class org.apache.spark.sql.hive.thriftserver.WMFSparkSQLCLIDriver --queue production --deploy-mode client hdfs:///wmf/cache/artifacts/airflow/analytics/wmf-sparksqlclidriver-1.0.0.jar -f https://gitlab.wikimedia.org/-/snippets/124/raw/main/refine_webrequest_hourly.hql - -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics/refinery-hive-0.2.41-shaded.jar + -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics/refinery-hive-0.2.30-shaded.jar -d source_table=gmodena.webrequest_frontend_rc0 -d webrequest_source=text -d destination_table=gmodena.webrequest -d year=2024 -d month=4 -d day=15 -d hour=15 -d record_version=0.0.27 -d coalesce_partitions=256 -d spark_sql_shuffle_partitions=256 -d excluded_row_ids= diff --git a/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_frontend_hourly_dag.py-refine_webrequest_frontend_hourly_upload-refine_webrequest.expected b/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_frontend_hourly_dag.py-refine_webrequest_frontend_hourly_upload-refine_webrequest.expected index 4bfc0a4f3..5e136bdef 100644 --- a/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_frontend_hourly_dag.py-refine_webrequest_frontend_hourly_upload-refine_webrequest.expected +++ b/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_frontend_hourly_dag.py-refine_webrequest_frontend_hourly_upload-refine_webrequest.expected @@ -31,7 +31,7 @@ master: --class org.apache.spark.sql.hive.thriftserver.WMFSparkSQLCLIDriver --queue production --deploy-mode client hdfs:///wmf/cache/artifacts/airflow/analytics/wmf-sparksqlclidriver-1.0.0.jar -f https://gitlab.wikimedia.org/-/snippets/124/raw/main/refine_webrequest_hourly.hql - -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics/refinery-hive-0.2.41-shaded.jar + -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics/refinery-hive-0.2.30-shaded.jar -d source_table=gmodena.webrequest_frontend_rc0 -d webrequest_source=upload -d destination_table=gmodena.webrequest -d year=2024 -d month=4 -d day=15 -d hour=15 -d record_version=0.0.27 -d coalesce_partitions=256 -d spark_sql_shuffle_partitions=256 diff --git a/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_text-refine_webrequest.expected b/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_text-refine_webrequest.expected index 3e362f3ad..c02a7ca2f 100644 --- a/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_text-refine_webrequest.expected +++ b/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_text-refine_webrequest.expected @@ -31,7 +31,7 @@ master: --class org.apache.spark.sql.hive.thriftserver.WMFSparkSQLCLIDriver --queue production --deploy-mode client hdfs:///wmf/cache/artifacts/airflow/analytics/wmf-sparksqlclidriver-1.0.0.jar -f hdfs://analytics-hadoop/wmf/refinery/current/hql/webrequest/refine_webrequest_hourly.hql - -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics/refinery-hive-0.2.41-shaded.jar + -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics/refinery-hive-0.2.30-shaded.jar -d source_table=wmf_raw.webrequest -d webrequest_source=text -d destination_table=wmf.webrequest -d year=2023 -d month=4 -d day=11 -d hour=15 -d record_version=0.0.27 -d coalesce_partitions=256 -d spark_sql_shuffle_partitions=256 -d excluded_row_ids= diff --git a/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_upload-refine_webrequest.expected b/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_upload-refine_webrequest.expected index a947eb2f0..4b03b87ba 100644 --- a/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_upload-refine_webrequest.expected +++ b/tests/analytics/fixtures/spark_skein_specs/analytics_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_upload-refine_webrequest.expected @@ -31,7 +31,7 @@ master: --class org.apache.spark.sql.hive.thriftserver.WMFSparkSQLCLIDriver --queue production --deploy-mode client hdfs:///wmf/cache/artifacts/airflow/analytics/wmf-sparksqlclidriver-1.0.0.jar -f hdfs://analytics-hadoop/wmf/refinery/current/hql/webrequest/refine_webrequest_hourly.hql - -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics/refinery-hive-0.2.41-shaded.jar + -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics/refinery-hive-0.2.30-shaded.jar -d source_table=wmf_raw.webrequest -d webrequest_source=upload -d destination_table=wmf.webrequest -d year=2023 -d month=4 -d day=11 -d hour=15 -d record_version=0.0.27 -d coalesce_partitions=256 -d spark_sql_shuffle_partitions=256 -d excluded_row_ids= diff --git a/tests/analytics_test/fixtures/spark_skein_specs/analytics_test_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_test_text-refine_webrequest.expected b/tests/analytics_test/fixtures/spark_skein_specs/analytics_test_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_test_text-refine_webrequest.expected index 9ac3ca0f4..2abd55400 100644 --- a/tests/analytics_test/fixtures/spark_skein_specs/analytics_test_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_test_text-refine_webrequest.expected +++ b/tests/analytics_test/fixtures/spark_skein_specs/analytics_test_dags_webrequest_refine_webrequest_hourly_dag.py-refine_webrequest_hourly_test_text-refine_webrequest.expected @@ -31,7 +31,7 @@ master: --class org.apache.spark.sql.hive.thriftserver.WMFSparkSQLCLIDriver --queue production --deploy-mode client hdfs:///wmf/cache/artifacts/airflow/analytics_test/wmf-sparksqlclidriver-1.0.0.jar -f hdfs://analytics-test-hadoop/wmf/refinery/current/hql/webrequest/refine_webrequest_hourly.hql - -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics_test/refinery-hive-0.2.41-shaded.jar + -d refinery_jar=hdfs:///wmf/cache/artifacts/airflow/analytics_test/refinery-hive-0.2.30-shaded.jar -d source_table=wmf_raw.webrequest -d webrequest_source=test_text -d destination_table=wmf.webrequest -d year=2023 -d month=4 -d day=1 -d hour=17 -d record_version=0.0.27 -d coalesce_partitions=256 -d spark_sql_shuffle_partitions=256 -d excluded_row_ids= -- GitLab