diff --git a/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql b/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql
new file mode 100644
index 0000000000000000000000000000000000000000..fddf70fde2ec5eec86d1a71c0e46d1cc7eca3ec8
--- /dev/null
+++ b/incident_reporting_system/create_incident_reporting_system_incident_type_daily.hql
@@ -0,0 +1,30 @@
+--
+-- Creates the Iceberg table that stores, for each day, wiki and funnel,
+-- the number of Incident Reporting System incidents per incident type.
+--
+-- Parameters:
+--     table_name          Fully qualified name of the table to create.
+--     base_directory      HDFS path to use as the table's base location.
+--
+-- Usage:
+--     spark3-sql -f create_incident_reporting_system_incident_type_daily.hql \
+--         -d table_name=wmf_product.incident_reporting_system_incident_type_daily \
+--         -d base_directory=/wmf/data/wmf_product/incident_reporting_system/incident_type_daily
+--
+
+CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name} (
+    day            DATE   COMMENT 'YYYY-MM-DD',
+    wiki_db        STRING COMMENT 'project name',
+    funnel_name    STRING COMMENT 'emergency, non-emergency',
+    incident_type  STRING COMMENT 'type of incident',
+    incident_count BIGINT COMMENT 'number of incidents'
+)
+USING ICEBERG
+TBLPROPERTIES (
+    'format-version' = '2',
+    'write.delete.mode' = 'copy-on-write',
+    'write.parquet.compression-codec' = 'zstd'
+)
+LOCATION '${base_directory}'
+;
+
diff --git a/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql b/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql
new file mode 100644
index 0000000000000000000000000000000000000000..659fefba221b0ef534c05174988c1f9d44048792
--- /dev/null
+++ b/incident_reporting_system/create_incident_reporting_system_overall_metrics_daily.hql
@@ -0,0 +1,33 @@
+--
+-- Creates a table for overall metrics for Incident Reporting System on a daily basis.
+--
+-- Parameters:
+--     table_name          Fully qualified name of the table to create.
+--     base_directory      HDFS path to use as the table's base location.
+--
+-- Usage:
+--     spark3-sql -f create_incident_reporting_system_overall_metrics_daily.hql \
+--         -d table_name=wmf_product.incident_reporting_system_overall_metrics_daily \
+--         -d base_directory=/wmf/data/wmf_product/incident_reporting_system/overall_metrics_daily
+
+-- NOTE: generate_incident_reporting_system_overall_metrics_daily.hql inserts by
+-- position (no column list), so the column order below must match its SELECT list.
+CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name}(
+    day                           date   COMMENT 'YYYY-MM-DD',
+    wiki_db                       string COMMENT 'project name',
+    view_count                    bigint COMMENT 'number of report started',
+    emergency_click_count         bigint COMMENT 'number of clicks on emergency report',
+    nonemergency_click_count      bigint COMMENT 'number of clicks on non-emergency report',
+    emergency_start_submit_count  bigint COMMENT 'number of view to submit report step of emergency report',
+    emergency_submitted_count     bigint COMMENT 'number of submitted emergency report',
+    non_emergency_completed_count bigint COMMENT 'number of completion of non-emergency flow'
+)
+USING ICEBERG
+TBLPROPERTIES (
+    'format-version' = '2',
+    'write.delete.mode' = 'copy-on-write',
+    'write.parquet.compression-codec' = 'zstd'
+)
+LOCATION '${base_directory}'
+;
+
diff --git a/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql b/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql
new file mode 100644
index 0000000000000000000000000000000000000000..dfd8041511db30e7cb38173ecd678bc6c8f08456
--- /dev/null
+++ b/incident_reporting_system/generate_incident_reporting_system_incident_type_daily.hql
@@ -0,0 +1,123 @@
+-- Aggregates Incident Reporting System interaction events into daily incident counts per wiki, funnel and incident type.
+--
+-- Parameters:
+--     source_table        -- Fully qualified table name to compute the aggregation for.
+--     destination_table   -- Fully qualified table name to fill in aggregated values.
+--     coalesce_partitions -- Number of partitions to write
+--     target_year         -- Year of partition to compute aggregation for.
+--     target_month        -- Month of partition to compute aggregation for.
+--     target_day          -- Day of partition to compute aggregation for.
+--
+-- Usage:
+--     spark3-sql -f generate_incident_reporting_system_incident_type_daily.hql \
+--         -d source_table=event.mediawiki_product_metrics_incident_reporting_system_interaction \
+--         -d destination_table=wmf_product.incident_reporting_system_incident_type_daily \
+--         -d coalesce_partitions=1 \
+--         -d target_year=2025 \
+--         -d target_month=2 \
+--         -d target_day=26
+
+-- Delete existing data for the period to prevent duplication of data in case of recomputation
+DELETE FROM ${destination_table}
+WHERE
+    day = TO_DATE(
+        CONCAT_WS('-',
+            LPAD(${target_year}, 4, '0'),
+            LPAD(${target_month}, 2, '0'),
+            LPAD(${target_day}, 2, '0')),
+        'yyyy-MM-dd'
+    )
+;
+
+-- Compute data for the period
+WITH
+action_data AS ( --- view and click events of the target day, 'test' project excluded
+    SELECT
+        TO_DATE(
+            CONCAT_WS('-',
+                LPAD(${target_year}, 4, '0'),
+                LPAD(${target_month}, 2, '0'),
+                LPAD(${target_day}, 2, '0')),
+            'yyyy-MM-dd'
+        ) AS date,
+        CONCAT(normalized_host.project, '.', normalized_host.project_class) AS wiki_db,
+        performer.session_id AS session_id,
+        action,
+        action_subtype,
+        action_context,
+        action_source,
+        funnel_name,
+        funnel_entry_token
+    FROM ${source_table}
+    WHERE year = ${target_year}
+        AND month = ${target_month}
+        AND day = ${target_day}
+        AND action IN('view', 'click')
+        AND normalized_host.project != 'test'
+),
+describe_emergency AS ( --- selected flow and emergency incident type
+    SELECT
+        date,
+        wiki_db,
+        funnel_name,
+        session_id,
+        funnel_entry_token,
+        GET_JSON_OBJECT(action_context, '$.harm_option') AS emergency_option
+    FROM action_data
+    WHERE action_source = 'form'
+        AND action_subtype = 'continue'
+        AND funnel_name = 'emergency'
+),
+describe_non_emergency AS ( --- describe non-emergency incident
+    SELECT
+        date,
+        wiki_db,
+        funnel_name,
+        session_id,
+        funnel_entry_token,
+        action_context AS non_emergency_option
+    FROM action_data
+    WHERE action = 'click'
+        AND action_subtype = 'continue'
+        AND action_source = 'describe_unacceptable_behavior'
+        AND funnel_name = 'non-emergency'
+        AND action_context IS NOT NULL
+),
+emergency_type_count AS ( --- distinct funnel entries per emergency incident type
+    SELECT
+        date,
+        wiki_db,
+        funnel_name,
+        emergency_option AS type,
+        COUNT(DISTINCT(funnel_entry_token)) AS incident_count
+    FROM describe_emergency
+    GROUP BY date, wiki_db, funnel_name, type
+),
+non_emergency_type_count AS ( --- distinct funnel entries per non-emergency incident type
+    SELECT
+        date,
+        wiki_db,
+        funnel_name,
+        non_emergency_option AS type,
+        COUNT(DISTINCT(funnel_entry_token)) AS incident_count
+    FROM describe_non_emergency
+    GROUP BY date, wiki_db, funnel_name, type
+)
+
+-- The partitioning hint sits on the outer SELECT so that it applies to the whole
+-- unioned result; placed on the first UNION branch it would cover that branch only.
+INSERT INTO TABLE ${destination_table}
+SELECT /*+ COALESCE(${coalesce_partitions}) */
+    CAST(date AS DATE) AS day,
+    wiki_db,
+    funnel_name,
+    type,
+    incident_count
+FROM (
+    SELECT date, wiki_db, funnel_name, type, incident_count
+    FROM emergency_type_count
+    UNION ALL -- branches are disjoint on funnel_name and already grouped: no dedup needed
+    SELECT date, wiki_db, funnel_name, type, incident_count
+    FROM non_emergency_type_count
+) AS incident_type_counts
+;
\ No newline at end of file
diff --git a/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql b/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql
new file mode 100644
index 0000000000000000000000000000000000000000..9c465b0511c0cd79a1092b40590e2dc4c1d2f434
--- /dev/null
+++ b/incident_reporting_system/generate_incident_reporting_system_overall_metrics_daily.hql
@@ -0,0 +1,157 @@
+-- Aggregates Incident Reporting System interaction events into daily overall funnel metrics per wiki.
+--
+-- Parameters:
+--     source_table        -- Fully qualified table name to compute the aggregation for.
+--     destination_table   -- Fully qualified table name to fill in aggregated values.
+--     coalesce_partitions -- Number of partitions to write
+--     target_year         -- Year of partition to compute aggregation for.
+--     target_month        -- Month of partition to compute aggregation for.
+--     target_day          -- Day of partition to compute aggregation for.
+--
+-- Usage:
+--     spark3-sql -f generate_incident_reporting_system_overall_metrics_daily.hql \
+--         -d source_table=event.mediawiki_product_metrics_incident_reporting_system_interaction \
+--         -d destination_table=wmf_product.incident_reporting_system_overall_metrics_daily \
+--         -d coalesce_partitions=1 \
+--         -d target_year=2025 \
+--         -d target_month=2 \
+--         -d target_day=26
+
+-- Delete existing data for the period to prevent duplication of data in case of recomputation
+DELETE FROM ${destination_table}
+WHERE
+    day = TO_DATE(
+        CONCAT_WS('-',
+            LPAD(${target_year}, 4, '0'),
+            LPAD(${target_month}, 2, '0'),
+            LPAD(${target_day}, 2, '0')),
+        'yyyy-MM-dd'
+    )
+;
+
+-- Compute data for the period
+WITH
+action_data AS ( --- view and click events of the target day, 'test' project excluded
+    SELECT
+        TO_DATE(
+            CONCAT_WS('-',
+                LPAD(${target_year}, 4, '0'),
+                LPAD(${target_month}, 2, '0'),
+                LPAD(${target_day}, 2, '0')),
+            'yyyy-MM-dd'
+        ) AS date,
+        CONCAT(normalized_host.project, '.', normalized_host.project_class) AS wiki_db,
+        performer.session_id AS session_id,
+        action,
+        action_subtype,
+        action_context,
+        action_source,
+        funnel_name,
+        funnel_entry_token,
+        funnel_event_sequence_position
+    FROM ${source_table}
+    WHERE year = ${target_year}
+        AND month = ${target_month}
+        AND day = ${target_day}
+        AND action IN('view', 'click')
+        AND normalized_host.project != 'test'
+),
+initial_form_interactions AS ( --- report start and choose flow
+    SELECT
+        date,
+        wiki_db,
+        action,
+        funnel_entry_token,
+        funnel_event_sequence_position,
+        action_context
+    FROM action_data
+    WHERE action_source = 'form'
+        AND action_subtype IS NULL
+        AND (action_context IS NULL OR action_context IN('emergency', 'non-emergency'))
+),
+submit_page_view AS ( --- user reach submit report page for emergency flow
+    SELECT
+        date,
+        wiki_db,
+        session_id,
+        funnel_entry_token
+    FROM action_data
+    WHERE action = 'view'
+        AND action_source = 'submit_report'
+        AND funnel_name = 'emergency'
+),
+submitted_emergency AS ( --- submitted emergency incident report
+    SELECT
+        date,
+        wiki_db,
+        session_id,
+        funnel_entry_token
+    FROM action_data
+    WHERE action = 'view'
+        AND action_source = 'submitted'
+        AND funnel_name = 'emergency'
+),
+get_support_click AS ( --- non-emergency get support
+    SELECT
+        date,
+        wiki_db,
+        session_id,
+        funnel_entry_token
+    FROM action_data
+    WHERE action = 'click'
+        AND action_source = 'get_support'
+        AND funnel_name = 'non-emergency'
+),
+initial_form_counts AS ( --- event counts on the initial form, per day and wiki
+    SELECT
+        date,
+        wiki_db,
+        SUM(IF(action = 'view' AND funnel_event_sequence_position = 1, 1, 0)) AS view_count,
+        SUM(IF(action = 'click' AND action_context = 'emergency', 1, 0)) AS emergency_click_count,
+        SUM(IF(action = 'click' AND action_context = 'non-emergency', 1, 0)) AS nonemergency_click_count
+    FROM initial_form_interactions
+    GROUP BY date, wiki_db
+),
+submit_page_count AS ( --- distinct funnel entries that viewed the emergency submit page
+    SELECT
+        date,
+        wiki_db,
+        COUNT(DISTINCT(funnel_entry_token)) AS emergency_start_submit_count
+    FROM submit_page_view
+    GROUP BY date, wiki_db
+),
+emergency_submitted_count AS ( --- distinct funnel entries that viewed the emergency 'submitted' page
+    SELECT
+        date,
+        wiki_db,
+        COUNT(DISTINCT(funnel_entry_token)) AS emergency_submitted_count
+    FROM submitted_emergency
+    GROUP BY date, wiki_db
+),
+non_emergency_completed_count AS ( --- distinct funnel entries that clicked non-emergency get support
+    SELECT
+        date,
+        wiki_db,
+        COUNT(DISTINCT(funnel_entry_token)) AS non_emergency_completed_count
+    FROM get_support_click
+    GROUP BY date, wiki_db
+)
+
+INSERT INTO TABLE ${destination_table}
+SELECT /*+ COALESCE(${coalesce_partitions}) */
+    CAST(i.date AS DATE) AS day,
+    i.wiki_db,
+    i.view_count,
+    COALESCE(i.emergency_click_count, 0) AS emergency_click_count,
+    COALESCE(i.nonemergency_click_count, 0) AS nonemergency_click_count,
+    COALESCE(sp.emergency_start_submit_count, 0) AS emergency_start_submit_count,
+    COALESCE(es.emergency_submitted_count, 0) AS emergency_submitted_count,
+    COALESCE(nec.non_emergency_completed_count, 0) AS non_emergency_completed_count
+FROM initial_form_counts AS i
+LEFT JOIN submit_page_count AS sp
+    ON (i.date = sp.date AND i.wiki_db = sp.wiki_db)
+LEFT JOIN emergency_submitted_count AS es
+    ON (i.date = es.date AND i.wiki_db = es.wiki_db)
+LEFT JOIN non_emergency_completed_count AS nec
+    ON (i.date = nec.date AND i.wiki_db = nec.wiki_db)
+;
\ No newline at end of file