diff --git a/main/dags/commons/commons_impact_metrics_monthly_dag.py b/main/dags/commons/commons_impact_metrics_monthly_dag.py index 1e27bdd9e1d949e4f120130d4c5a5eefd0487ec6..74581cb3c0f1c2d30a56d15b845e80653c1c99f6 100644 --- a/main/dags/commons/commons_impact_metrics_monthly_dag.py +++ b/main/dags/commons/commons_impact_metrics_monthly_dag.py @@ -8,7 +8,8 @@ Generates the 5 main Iceberg datasets of the Commons Impact Metrics data product Uses the following data sources: * wmf_raw.mediawiki_page - * wmf_raw.mediawiki_image + * wmf_raw.mediawiki_file + * wmf_raw.mediawiki_filetypes * wmf_raw.mediawiki_imagelinks * wmf_raw.mediawiki_categorylinks * wmf_raw.mediawiki_revision @@ -48,7 +49,8 @@ props = DagProperties( start_date=datetime(2025, 2, 1), # Source tables. mediawiki_page_table="wmf_raw.mediawiki_page", - mediawiki_image_table="wmf_raw.mediawiki_image", + mediawiki_file_table="wmf_raw.mediawiki_file", + mediawiki_filetypes_table="wmf_raw.mediawiki_filetypes", mediawiki_imagelinks_table="wmf_raw.mediawiki_imagelinks", mediawiki_categorylinks_table="wmf_raw.mediawiki_categorylinks", mediawiki_revision_table="wmf_raw.mediawiki_revision", @@ -126,7 +128,8 @@ tags = [ "uses_spark", "uses_hql", "requires_wmf_raw_mediawiki_page", - "requires_wmf_raw_mediawiki_image", + "requires_wmf_raw_mediawiki_file", + "requires_wmf_raw_mediawiki_filetypes", "requires_wmf_raw_mediawiki_imagelinks", "requires_wmf_raw_mediawiki_categorylinks", "requires_wmf_raw_mediawiki_revision", @@ -158,10 +161,16 @@ with create_easy_dag( external_task_id="write_page_table_partitioned_file", ) - wait_for_mediawiki_image = ExternalTaskSensor( - task_id="wait_for_mediawiki_image", + wait_for_mediawiki_file = ExternalTaskSensor( + task_id="wait_for_mediawiki_file", external_dag_id="mediawiki_history_load", - external_task_id="write_image_table_partitioned_file", + external_task_id="write_file_table_partitioned_file", + ) + + wait_for_mediawiki_filetypes = ExternalTaskSensor( + task_id="wait_for_mediawiki_filetypes", + external_dag_id="mediawiki_history_load", + external_task_id="write_filetypes_table_partitioned_file", ) wait_for_mediawiki_imagelinks = ExternalTaskSensor( @@ -250,7 +259,8 @@ with create_easy_dag( sql=props.commons_media_file_metrics_snapshot_hql, query_parameters={ "category_and_media_with_usage_map_table": category_and_media_with_usage_map_table, - "mediawiki_image_table": props.mediawiki_image_table, + "mediawiki_file_table": props.mediawiki_file_table, + "mediawiki_filetypes_table": props.mediawiki_filetypes_table, "commons_media_file_metrics_snapshot_table": props.commons_media_file_metrics_snapshot_table, "year_month": year_month, "coalesce_partitions": props.hql_output_partitions, @@ -317,7 +327,8 @@ with create_easy_dag( chain( # Wait for source data. wait_for_mediawiki_page, - wait_for_mediawiki_image, + wait_for_mediawiki_file, + wait_for_mediawiki_filetypes, wait_for_mediawiki_imagelinks, wait_for_mediawiki_categorylinks, wait_for_mediawiki_revision, diff --git a/main/dags/mediawiki/mediawiki_history_load_dag.py b/main/dags/mediawiki/mediawiki_history_load_dag.py index 2b85094cdaaf2b5c969cec66326dc4b028fc5bb6..fd7f361af3bb328f62af88bdfedbf0c29777a85f 100644 --- a/main/dags/mediawiki/mediawiki_history_load_dag.py +++ b/main/dags/mediawiki/mediawiki_history_load_dag.py @@ -17,7 +17,7 @@ TODO: this DAG waits for all sqoops to be done, but that means most downstream j 1a. cu_changes, cu_log 1b. archive,change_tag,change_tag_def,logging,page,revision,user,user_groups - 2. category,categorylinks,content,content_models,externallinks,image,imagelinks,ipblocks, + 2. category,categorylinks,content,content_models,externallinks,file,filetypes,image,imagelinks,ipblocks, ipblocks_restrictions,iwlinks,langlinks,pagelinks,page_props,page_restrictions,redirect, slots,slot_roles,templatelinks,user_properties,wbc_entity_usage 3. actor,comment @@ -48,6 +48,8 @@ public_tables = [ "content", "content_models", "externallinks", + "file", + "filetypes", "image", "imagelinks", "ipblocks", diff --git a/tests/main/commons/commons_impact_metrics_monthly_dag_test.py b/tests/main/commons/commons_impact_metrics_monthly_dag_test.py index 53dcc53ef1a2212d6073097634b85899992d14f3..9ae0077f613d5c4b823436341f80895de9b8cf70 100644 --- a/tests/main/commons/commons_impact_metrics_monthly_dag_test.py +++ b/tests/main/commons/commons_impact_metrics_monthly_dag_test.py @@ -11,4 +11,4 @@ def test_commons_impact_metrics_monthly_dag_loaded(dagbag): assert dagbag.import_errors == {} dag = dagbag.get_dag(dag_id="commons_impact_metrics_monthly") assert dag is not None - assert len(dag.tasks) == 16 + assert len(dag.tasks) == 17 diff --git a/tests/main/fixtures/spark_skein_specs/main_dags_commons_commons_impact_metrics_monthly_dag.py-commons_impact_metrics_monthly-compute_media_file_metrics.expected b/tests/main/fixtures/spark_skein_specs/main_dags_commons_commons_impact_metrics_monthly_dag.py-commons_impact_metrics_monthly-compute_media_file_metrics.expected index b4bbe6edb130072daf82f5c5b80e88b083d868b4..833b82f1bd32d2e4e6f2014c61ac887e2ebb060c 100644 --- a/tests/main/fixtures/spark_skein_specs/main_dags_commons_commons_impact_metrics_monthly_dag.py-commons_impact_metrics_monthly-compute_media_file_metrics.expected +++ b/tests/main/fixtures/spark_skein_specs/main_dags_commons_commons_impact_metrics_monthly_dag.py-commons_impact_metrics_monthly-compute_media_file_metrics.expected @@ -69,7 +69,9 @@ master: -d \ category_and_media_with_usage_map_table=tmp.category_and_media_with_usage_map_2025_03 \ -d \ - mediawiki_image_table=wmf_raw.mediawiki_image \ + mediawiki_file_table=wmf_raw.mediawiki_file \ + -d \ + mediawiki_filetypes_table=wmf_raw.mediawiki_filetypes \ -d \ commons_media_file_metrics_snapshot_table=wmf_contributors.commons_media_file_metrics_snapshot \ -d \ diff --git a/tests/main/fixtures/spark_skein_specs/main_dags_mediawiki_mediawiki_history_load_dag.py-mediawiki_history_load-repair_file_table.expected b/tests/main/fixtures/spark_skein_specs/main_dags_mediawiki_mediawiki_history_load_dag.py-mediawiki_history_load-repair_file_table.expected new file mode 100644 index 0000000000000000000000000000000000000000..6fa5338f0ccf26c29cc42e5926cac89bcc075904 --- /dev/null +++ b/tests/main/fixtures/spark_skein_specs/main_dags_mediawiki_mediawiki_history_load_dag.py-mediawiki_history_load-repair_file_table.expected @@ -0,0 +1,93 @@ +acls: + enable: false + modify_groups: [] + modify_users: [] + ui_users: [] + view_groups: [] + view_users: [] +file_systems: [] +master: + env: + SPARK_CONF_DIR: /etc/spark3/conf + SPARK_HOME: /usr/lib/spark3 + files: + airflow.keytab: + size: 0 + source: file:///pytest/path/to/airflow.keytab + timestamp: 0 + type: FILE + visibility: APPLICATION + log_level: INFO + resources: + fpgas: 0 + gpus: 0 + memory: 4096 + vcores: 2 + script: |- + spark3-submit \ + --driver-cores \ + 2 \ + --conf \ + spark.executorEnv.SPARK_HOME=/usr/lib/spark3 \ + --conf \ + spark.executorEnv.SPARK_CONF_DIR=/etc/spark3/conf \ + --master \ + yarn \ + --conf \ + spark.datahub.emitter=kafka \ + --conf \ + spark.extraListeners=datahub.spark.DatahubSparkListener \ + --conf \ + spark.datahub.kafka.bootstrap=test.kafka.jumbo:9092 \ + --conf \ + spark.datahub.kafka.schema_registry_url=https://test.schema.registry:30443/schema-registry/api/ \ + --conf \ + spark.datahub.flow_name=airflow_mediawiki_history_load__repair_file_table \ + --conf \ + spark.datahub.log.mcps=false \ + --conf \ + spark.dynamicAllocation.enabled=true \ + --conf \ + spark.dynamicAllocation.maxExecutors=16 \ + --conf \ + spark.shuffle.service.enabled=true \ + --conf \ + spark.yarn.maxAppAttempts=1 \ + --conf \ + spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 \ + --conf \ + spark.yarn.appMasterEnv.SPARK_CONF_DIR=/etc/spark3/conf \ + --conf \ + spark.yarn.appMasterEnv.SPARK_HOME=/usr/lib/spark3 \ + --jars \ + hdfs:///wmf/cache/artifacts/airflow/analytics/acryl-spark-lineage-0.2.16.jar \ + --executor-cores \ + 2 \ + --executor-memory \ + 4G \ + --driver-memory \ + 4G \ + --keytab \ + airflow.keytab \ + --principal \ + airflow \ + --name \ + mediawiki_history_load__repair_file_table__20250301 \ + --class \ + org.apache.spark.sql.hive.thriftserver.WMFSparkSQLCLIDriver \ + --queue \ + production \ + --deploy-mode \ + client \ + hdfs:///wmf/cache/artifacts/airflow/analytics/wmf-sparksqlclidriver-1.0.0.jar \ + -f \ + hdfs://analytics-hadoop/wmf/refinery/current/hql/utils/repair_partitions.hql \ + -d \ + table=wmf_raw.mediawiki_file +max_attempts: 1 +name: Airflow SparkSkeinSubmitHook skein launcher mediawiki_history_load__repair_file_table__20250301 +node_label: '' +queue: production +services: {} +tags: [] +user: '' diff --git a/tests/main/fixtures/spark_skein_specs/main_dags_mediawiki_mediawiki_history_load_dag.py-mediawiki_history_load-repair_filetypes_table.expected b/tests/main/fixtures/spark_skein_specs/main_dags_mediawiki_mediawiki_history_load_dag.py-mediawiki_history_load-repair_filetypes_table.expected new file mode 100644 index 0000000000000000000000000000000000000000..b02c9db6ccdfe71329856aa241a933e980a153d2 --- /dev/null +++ b/tests/main/fixtures/spark_skein_specs/main_dags_mediawiki_mediawiki_history_load_dag.py-mediawiki_history_load-repair_filetypes_table.expected @@ -0,0 +1,93 @@ +acls: + enable: false + modify_groups: [] + modify_users: [] + ui_users: [] + view_groups: [] + view_users: [] +file_systems: [] +master: + env: + SPARK_CONF_DIR: /etc/spark3/conf + SPARK_HOME: /usr/lib/spark3 + files: + airflow.keytab: + size: 0 + source: file:///pytest/path/to/airflow.keytab + timestamp: 0 + type: FILE + visibility: APPLICATION + log_level: INFO + resources: + fpgas: 0 + gpus: 0 + memory: 4096 + vcores: 2 + script: |- + spark3-submit \ + --driver-cores \ + 2 \ + --conf \ + spark.executorEnv.SPARK_HOME=/usr/lib/spark3 \ + --conf \ + spark.executorEnv.SPARK_CONF_DIR=/etc/spark3/conf \ + --master \ + yarn \ + --conf \ + spark.datahub.emitter=kafka \ + --conf \ + spark.extraListeners=datahub.spark.DatahubSparkListener \ + --conf \ + spark.datahub.kafka.bootstrap=test.kafka.jumbo:9092 \ + --conf \ + spark.datahub.kafka.schema_registry_url=https://test.schema.registry:30443/schema-registry/api/ \ + --conf \ + spark.datahub.flow_name=airflow_mediawiki_history_load__repair_filetypes_table \ + --conf \ + spark.datahub.log.mcps=false \ + --conf \ + spark.dynamicAllocation.enabled=true \ + --conf \ + spark.dynamicAllocation.maxExecutors=16 \ + --conf \ + spark.shuffle.service.enabled=true \ + --conf \ + spark.yarn.maxAppAttempts=1 \ + --conf \ + spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 \ + --conf \ + spark.yarn.appMasterEnv.SPARK_CONF_DIR=/etc/spark3/conf \ + --conf \ + spark.yarn.appMasterEnv.SPARK_HOME=/usr/lib/spark3 \ + --jars \ + hdfs:///wmf/cache/artifacts/airflow/analytics/acryl-spark-lineage-0.2.16.jar \ + --executor-cores \ + 2 \ + --executor-memory \ + 4G \ + --driver-memory \ + 4G \ + --keytab \ + airflow.keytab \ + --principal \ + airflow \ + --name \ + mediawiki_history_load__repair_filetypes_table__20250301 \ + --class \ + org.apache.spark.sql.hive.thriftserver.WMFSparkSQLCLIDriver \ + --queue \ + production \ + --deploy-mode \ + client \ + hdfs:///wmf/cache/artifacts/airflow/analytics/wmf-sparksqlclidriver-1.0.0.jar \ + -f \ + hdfs://analytics-hadoop/wmf/refinery/current/hql/utils/repair_partitions.hql \ + -d \ + table=wmf_raw.mediawiki_filetypes +max_attempts: 1 +name: Airflow SparkSkeinSubmitHook skein launcher mediawiki_history_load__repair_filetypes_table__20250301 +node_label: '' +queue: production +services: {} +tags: [] +user: '' diff --git a/tests/main/mediawiki/mediawiki_history_load_dag_test.py b/tests/main/mediawiki/mediawiki_history_load_dag_test.py index 9a4cb0a8942959d8ddbe332617fc8d14cef40784..31b0e9c0fdc3582fa2c245e5c4d006e5e73b241b 100644 --- a/tests/main/mediawiki/mediawiki_history_load_dag_test.py +++ b/tests/main/mediawiki/mediawiki_history_load_dag_test.py @@ -15,5 +15,5 @@ def fixture_dagbag(): def test_mediawiki_history_load_dag(dagbag): assert not dagbag.import_errors dag = dagbag.get_dag(dag_id="mediawiki_history_load") - assert len(dag.tasks) == 105 + assert len(dag.tasks) == 111 assert dag.max_active_tasks == 10