From 4a0d41c052312cf4ff71c10bad37a0877d0c078c Mon Sep 17 00:00:00 2001 From: conniecc1 Date: Tue, 25 Feb 2025 03:39:07 -0500 Subject: [PATCH 1/2] Bug: T386672 --- ...t_reporting_system_incident_type_daily.hql | 30 ++++ ...reporting_system_overall_metrics_daily.hql | 33 ++++ ...t_reporting_system_incident_type_daily.hql | 123 ++++++++++++++ ...reporting_system_overall_metrics_daily.hql | 157 ++++++++++++++++++ 4 files changed, 343 insertions(+) create mode 100644 incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql create mode 100644 incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql create mode 100644 incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql create mode 100644 incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql diff --git a/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql b/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql new file mode 100644 index 0000000..2bc8655 --- /dev/null +++ b/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql @@ -0,0 +1,30 @@ +-- +-- Creates a table for incident types for Incident Reporting System on a daily basis. +-- +-- Parameters: +-- table_name Fully qualified name of the table to create. +-- base_directory HDFS path to use as the table's base location. 
+--
+-- Usage:
+--     spark3-sql -f create_incident_reporting_system_incident_type_daily.hql \
+--         -d table_name=wmf_product.incident_reporting_system_incident_type_daily \
+--         -d base_directory=/wmf/data/wmf_product/incident_reporting_system/incident_type_daily
+
+
+
+CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name}(
+    `date` date COMMENT 'YYYY-MM-DD',
+    wiki_db string COMMENT 'project name',
+    funnel_name string COMMENT 'emergency, non-emergency',
+    incident_type string COMMENT 'type of incident',
+    incident_count bigint COMMENT 'number of incidents'
+)
+USING ICEBERG
+TBLPROPERTIES (
+    'format-version' = '2',
+    'write.delete.mode' = 'copy-on-write',
+    'write.parquet.compression-codec' = 'zstd'
+)
+LOCATION '${base_directory}'
+;
+
diff --git a/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql b/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql
new file mode 100644
index 0000000..4ec40d2
--- /dev/null
+++ b/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql
@@ -0,0 +1,33 @@
+--
+-- Creates the Iceberg table that stores the daily overall metrics of the Incident Reporting System.
+--
+-- Parameters:
+--     table_name          Fully qualified name of the table to create.
+--     base_directory      HDFS path to use as the table's base location.
+--
+-- Usage:
+--     spark3-sql -f create_incident_reporting_system_overall_metrics_daily.hql \
+--         -d table_name=wmf_product.incident_reporting_system_overall_metrics_daily \
+--         -d base_directory=/wmf/data/wmf_product/incident_reporting_system/overall_metrics_daily
+
+
+
+CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name}(
+    `date` date COMMENT 'YYYY-MM-DD',
+    wiki_db string COMMENT 'project name',
+    view_count bigint COMMENT 'number of report started',
+    emergency_click_count bigint COMMENT 'number of clicks on emergency report',
+    nonemergency_click_count bigint COMMENT 'number of clicks on non-emergency report',
+    emergency_start_submit_count bigint COMMENT 'number of view to submit report step of emergency report',
+    emergency_submitted_count bigint COMMENT 'number of submitted emergency report',
+    non_emergency_completed_count bigint COMMENT 'number of completion of non-emergency flow'
+)
+USING ICEBERG
+TBLPROPERTIES (
+    'format-version' = '2',
+    'write.delete.mode' = 'copy-on-write',
+    'write.parquet.compression-codec' = 'zstd'
+)
+LOCATION '${base_directory}'
+;
+
diff --git a/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql b/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql
new file mode 100644
index 0000000..a7e32de
--- /dev/null
+++ b/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql
@@ -0,0 +1,123 @@
+-- Computes the daily number of incidents per wiki, funnel and incident type for the Incident Reporting System.
+--
+-- Parameters:
+--     source_table         -- Fully qualified table name to compute the aggregation for.
+--     destination_table    -- Fully qualified table name to fill in aggregated values.
+--     coalesce_partitions  -- Number of partitions to write
+--     target_year          -- Year of partition to compute aggregation for.
+--     target_month         -- Month of partition to compute aggregation for.
+--     target_day           -- Day of partition to compute aggregation for.
+--
+-- Usage:
+--     spark3-sql -f generate_incident_reporting_system_incident_type_daily.hql \
+--         -d source_table=event.mediawiki_product_metrics_incident_reporting_system_interaction \
+--         -d destination_table=wmf_product.incident_reporting_system_incident_type_daily \
+--         -d coalesce_partitions=1 \
+--         -d target_year=2025 \
+--         -d target_month=2 \
+--         -d target_day=26
+
+-- Delete existing data for the period to prevent duplication of data in case of recomputation
+DELETE FROM ${destination_table}
+WHERE
+    date = TO_DATE(
+        CONCAT_WS('-',
+            LPAD(${target_year}, 4, '0'),
+            LPAD(${target_month}, 2, '0'),
+            LPAD(${target_day}, 2, '0')),
+        'yyyy-MM-dd'
+    )
+;
+
+-- Compute data for the period
+WITH
+action_data AS (
+    SELECT
+        TO_DATE(
+            CONCAT_WS('-',
+                LPAD(${target_year}, 4, '0'),
+                LPAD(${target_month}, 2, '0'),
+                LPAD(${target_day}, 2, '0')),
+            'yyyy-MM-dd'
+        ) AS date,
+        CONCAT(normalized_host.project, '.', normalized_host.project_class) AS wiki_db,
+        performer.session_id AS session_id,
+        action,
+        action_subtype,
+        action_context,
+        action_source,
+        funnel_name,
+        funnel_entry_token
+    FROM ${source_table}
+    WHERE year = ${target_year}
+        AND month = ${target_month}
+        AND day = ${target_day}
+        AND action IN ('view', 'click')
+        AND normalized_host.project != 'test'
+),
+describe_emergency AS ( -- emergency flow: harm option chosen on the form's continue step
+    SELECT
+        date,
+        wiki_db,
+        funnel_name,
+        session_id,
+        funnel_entry_token,
+        GET_JSON_OBJECT(action_context, '$.harm_option') AS emergency_option
+    FROM action_data
+    WHERE action_source = 'form'
+        AND action_subtype = 'continue'
+        AND funnel_name = 'emergency'
+),
+describe_non_emergency AS ( -- non-emergency flow: option carried by the continue click on describe_unacceptable_behavior
+    SELECT
+        date,
+        wiki_db,
+        funnel_name,
+        session_id,
+        funnel_entry_token,
+        action_context AS non_emergency_option
+    FROM action_data
+    WHERE action = 'click'
+        AND action_subtype = 'continue'
+        AND action_source = 'describe_unacceptable_behavior'
+        AND funnel_name = 'non-emergency'
+        AND action_context IS NOT NULL
+),
+emergency_type_count AS (
+    SELECT
+        date,
+        wiki_db,
+        funnel_name,
+        emergency_option AS type,
+        COUNT(DISTINCT funnel_entry_token) AS incident_count
+    FROM describe_emergency
+    GROUP BY date, wiki_db, funnel_name, type
+),
+non_emergency_type_count AS (
+    SELECT
+        date,
+        wiki_db,
+        funnel_name,
+        non_emergency_option AS type,
+        COUNT(DISTINCT funnel_entry_token) AS incident_count
+    FROM describe_non_emergency
+    GROUP BY date, wiki_db, funnel_name, type
+)
+
+INSERT INTO TABLE ${destination_table}
+SELECT /*+ COALESCE(${coalesce_partitions}) */
+    CAST(date AS DATE) AS date,
+    wiki_db,
+    funnel_name,
+    type,
+    incident_count
+FROM emergency_type_count
+UNION
+SELECT
+    CAST(date AS DATE) AS date,
+    wiki_db,
+    funnel_name,
+    type,
+    incident_count
+FROM non_emergency_type_count
+;
\ No newline at end of file
diff --git a/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql b/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql
new file mode 100644
index 0000000..cf08669
--- /dev/null
+++ b/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql
@@ -0,0 +1,157 @@
+-- Computes the daily overall funnel metrics per wiki for the Incident Reporting System.
+--
+-- Parameters:
+--     source_table         -- Fully qualified table name to compute the aggregation for.
+--     destination_table    -- Fully qualified table name to fill in aggregated values.
+--     coalesce_partitions  -- Number of partitions to write
+--     target_year          -- Year of partition to compute aggregation for.
+--     target_month         -- Month of partition to compute aggregation for.
+--     target_day           -- Day of partition to compute aggregation for.
+--
+-- Usage:
+--     spark3-sql -f generate_incident_reporting_system_overall_metrics_daily.hql \
+--         -d source_table=event.mediawiki_product_metrics_incident_reporting_system_interaction \
+--         -d destination_table=wmf_product.incident_reporting_system_overall_metrics_daily \
+--         -d coalesce_partitions=1 \
+--         -d target_year=2025 \
+--         -d target_month=2 \
+--         -d target_day=26
+
+-- Delete existing data for the period to prevent duplication of data in case of recomputation
+DELETE FROM ${destination_table}
+WHERE
+    date = TO_DATE(
+        CONCAT_WS('-',
+            LPAD(${target_year}, 4, '0'),
+            LPAD(${target_month}, 2, '0'),
+            LPAD(${target_day}, 2, '0')),
+        'yyyy-MM-dd'
+    )
+;
+
+WITH
+action_data AS (
+    SELECT
+        TO_DATE(
+            CONCAT_WS('-',
+                LPAD(${target_year}, 4, '0'),
+                LPAD(${target_month}, 2, '0'),
+                LPAD(${target_day}, 2, '0')),
+            'yyyy-MM-dd'
+        ) AS date,
+        CONCAT(normalized_host.project, '.', normalized_host.project_class) AS wiki_db,
+        performer.session_id AS session_id,
+        action,
+        action_subtype,
+        action_context,
+        action_source,
+        funnel_name,
+        funnel_entry_token,
+        funnel_event_sequence_position
+    FROM ${source_table}
+    WHERE year = ${target_year}
+        AND month = ${target_month}
+        AND day = ${target_day}
+        AND action IN ('view', 'click')
+        AND normalized_host.project != 'test'
+),
+initial_form_interactions AS ( -- form events without a subtype: report start and flow choice
+    SELECT
+        date,
+        wiki_db,
+        action,
+        funnel_entry_token,
+        funnel_event_sequence_position,
+        action_context
+    FROM action_data
+    WHERE action_source = 'form'
+        AND action_subtype IS NULL
+        AND (action_context IS NULL OR action_context IN ('emergency', 'non-emergency'))
+),
+submit_page_view AS ( -- emergency flow: views of the submit_report step
+    SELECT
+        date,
+        wiki_db,
+        session_id,
+        funnel_entry_token
+    FROM action_data
+    WHERE action = 'view'
+        AND action_source = 'submit_report'
+        AND funnel_name = 'emergency'
+),
+submitted_emergency AS ( -- emergency flow: views of the submitted step
+    SELECT
+        date,
+        wiki_db,
+        session_id,
+        funnel_entry_token
+    FROM action_data
+    WHERE action = 'view'
+        AND action_source = 'submitted'
+        AND funnel_name = 'emergency'
+),
+get_support_click AS ( -- non-emergency flow: clicks on get_support
+    SELECT
+        date,
+        wiki_db,
+        session_id,
+        funnel_entry_token
+    FROM action_data
+    WHERE action = 'click'
+        AND action_source = 'get_support'
+        AND funnel_name = 'non-emergency'
+
+),
+initial_form_counts AS (
+    SELECT
+        date,
+        wiki_db,
+        SUM(IF(action = 'view' AND funnel_event_sequence_position = 1, 1, 0)) AS view_count,
+        SUM(IF(action = 'click' AND action_context = 'emergency', 1, 0)) AS emergency_click_count,
+        SUM(IF(action = 'click' AND action_context = 'non-emergency', 1, 0)) AS nonemergency_click_count
+    FROM initial_form_interactions
+    GROUP BY date, wiki_db
+),
+submit_page_count AS (
+    SELECT
+        date,
+        wiki_db,
+        COUNT(DISTINCT funnel_entry_token) AS emergency_start_submit_count
+    FROM submit_page_view
+    GROUP BY date, wiki_db
+),
+emergency_submitted_count AS (
+    SELECT
+        date,
+        wiki_db,
+        COUNT(DISTINCT funnel_entry_token) AS emergency_submitted_count
+    FROM submitted_emergency
+    GROUP BY date, wiki_db
+),
+non_emergency_completed_count AS (
+    SELECT
+        date,
+        wiki_db,
+        COUNT(DISTINCT funnel_entry_token) AS non_emergency_completed_count
+    FROM get_support_click
+    GROUP BY date, wiki_db
+)
+
+INSERT INTO TABLE ${destination_table}
+SELECT /*+ COALESCE(${coalesce_partitions}) */
+    CAST(i.date AS DATE) AS date,
+    i.wiki_db,
+    view_count,
+    COALESCE(emergency_click_count, 0) AS emergency_click_count,
+    COALESCE(nonemergency_click_count, 0) AS nonemergency_click_count,
+    COALESCE(emergency_start_submit_count, 0) AS emergency_start_submit_count,
+    COALESCE(emergency_submitted_count, 0) AS emergency_submitted_count,
+    COALESCE(non_emergency_completed_count, 0) AS non_emergency_completed_count
+FROM initial_form_counts i
+LEFT JOIN submit_page_count sp
+    ON (i.date = sp.date AND i.wiki_db = sp.wiki_db)
+LEFT JOIN emergency_submitted_count es
+    ON (i.date = es.date AND i.wiki_db = es.wiki_db)
+LEFT JOIN non_emergency_completed_count nec
+    ON (i.date = nec.date AND i.wiki_db = nec.wiki_db)
+;
\ No newline at end of file
-- 
GitLab

From ec93faa72584586924c6c632cd75228bd578ef2e Mon Sep 17 00:00:00 2001
From: conniecc1
Date: Fri, 28 Feb 2025 17:29:47 -0500
Subject: [PATCH 2/2] update column names

---
 ...create_incident_reporting_system_incident_type_daily.hql | 2 +-
 ...eate_incident_reporting_system_overall_metrics_daily.hql | 2 +-
 ...nerate_incident_reporting_system_incident_type_daily.hql | 6 +++---
 ...rate_incident_reporting_system_overall_metrics_daily.hql | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql b/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql
index 2bc8655..fddf70f 100644
--- a/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql
+++ b/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql
@@ -13,7 +13,7 @@
 
 
 CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name}(
-    `date` date COMMENT 'YYYY-MM-DD',
+    day date COMMENT 'YYYY-MM-DD',
     wiki_db string COMMENT 'project name',
     funnel_name string COMMENT 'emergency, non-emergency',
     incident_type string COMMENT 'type of incident',
diff --git a/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql b/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql
index 4ec40d2..659fefb 100644
--- a/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql
+++ b/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql
@@ -13,7 +13,7 @@
 
 
 CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name}(
-    `date` date COMMENT 'YYYY-MM-DD',
+    day date COMMENT 'YYYY-MM-DD',
     wiki_db string COMMENT 'project name',
     view_count bigint COMMENT 'number of report started',
     emergency_click_count bigint COMMENT 'number of clicks on emergency report',
diff --git a/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql b/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql
index a7e32de..dfd8041 100644
--- a/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql
+++ b/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql
@@ -20,7 +20,7 @@
 -- Delete existing data for the period to prevent duplication of data in case of recomputation
 DELETE FROM ${destination_table}
 WHERE
-    date = TO_DATE(
+    day = TO_DATE(
         CONCAT_WS('-',
             LPAD(${target_year}, 4, '0'),
             LPAD(${target_month}, 2, '0'),
@@ -106,7 +106,7 @@ non_emergency_type_count AS (
 
 INSERT INTO TABLE ${destination_table}
 SELECT /*+ COALESCE(${coalesce_partitions}) */
-    CAST(date AS DATE) AS date,
+    CAST(date AS DATE) AS day,
     wiki_db,
     funnel_name,
     type,
@@ -114,7 +114,7 @@ SELECT /*+ COALESCE(${coalesce_partitions}) */
 FROM emergency_type_count
 UNION
 SELECT
-    CAST(date AS DATE) AS date,
+    CAST(date AS DATE) AS day,
     wiki_db,
     funnel_name,
     type,
diff --git a/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql b/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql
index cf08669..9c465b0 100644
--- a/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql
+++ b/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql
@@ -20,7 +20,7 @@
 -- Delete existing data for the period to prevent duplication of data in case of recomputation
 DELETE FROM ${destination_table}
 WHERE
-    date = TO_DATE(
+    day = TO_DATE(
         CONCAT_WS('-',
             LPAD(${target_year}, 4, '0'),
             LPAD(${target_month}, 2, '0'),
@@ -139,7 +139,7 @@ non_emergency_completed_count AS (
 
 INSERT INTO TABLE ${destination_table}
 SELECT /*+ COALESCE(${coalesce_partitions}) */
-    CAST(i.date AS DATE) AS day,
+    CAST(i.date AS DATE) AS day,
     i.wiki_db,
     view_count,
     COALESCE(emergency_click_count, 0) AS emergency_click_count,
-- 
GitLab