diff --git a/.gitignore b/.gitignore index 05a1ce789efa93196605f91ccc063d6a1b39db99..11f78d81fa4d6ef63c0cccdfbf1ff197f3aabc95 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ build/ *.egg-info/ .python-version .venv/ -*.pyc \ No newline at end of file +*.pyc +.env diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 078217291d8aff28495292a1603a2fc9597303d7..5ccf8b33ef87c3ff20713b83eeaa4fdd6f0ed84a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,10 @@ # This is the root CI pipeline. # Its only job is to trigger the correct downstream pipelines based on changed files. + +# This is the root CI pipeline. +# Its only job is to trigger the correct downstream pipelines based on changed files. + stages: - trigger @@ -41,6 +45,18 @@ trigger-tonecheck-hello-pipeline: - tests/common_utils/**/* - tests/tone_check/hello_world/* +trigger-add_a_link-pipeline: + stage: trigger + trigger: + include: training/add_a_link/.gitlab-ci.yml + strategy: depend + rules: + - changes: + - training/add_a_link/**/* + - training/common_utils/**/* + - tests/training/add_a_link/**/* + + # --- Future-proofing --- # When we add a 'model_2' project, we would add a similar job here: # @@ -52,4 +68,4 @@ trigger-tonecheck-hello-pipeline: # rules: # - changes: # - model_2/**/* -# - common_utils/**/* \ No newline at end of file +# - common_utils/**/* diff --git a/tests/training/add_a_link/conftest.py b/tests/training/add_a_link/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..00380704e9f2013c4efe08090ea4152efdbd6e93 --- /dev/null +++ b/tests/training/add_a_link/conftest.py @@ -0,0 +1,20 @@ +import pytest +from pyspark.sql import DataFrame, SparkSession +from collections.abc import Generator +from pyspark.sql.types import ( + StructType, StructField, StringType, LongType, MapType, StructType, StructField +) +import os + +os.environ["OBJC_DISABLE_INITIALIZE_FORK_SAFETY"] = "YES" + +@pytest.fixture(scope="session") +def spark() -> Generator[SparkSession, None, None]: + """Spark session with eager evaluation enabled.""" + spark = (SparkSession.builder.master("local[*]").appName("test") + .config("spark.executor.memory", "1g") + .config("spark.driver.memory", "1g") + .getOrCreate() + ) + yield spark + spark.stop() diff --git a/tests/training/add_a_link/test_generate_anchor_dictionary.py b/tests/training/add_a_link/test_generate_anchor_dictionary.py new file mode 100644 index 0000000000000000000000000000000000000000..82d73a69cac087b9d0b946b1874e23492de69389 --- /dev/null +++ b/tests/training/add_a_link/test_generate_anchor_dictionary.py @@ -0,0 +1,45 @@ +import pytest +from pyspark.sql import SparkSession +from pyspark.sql import Row +from add_a_link import generate_anchor_dictionary as gad + +def test_enrich_with_articles_basic(spark): + data = [ + {"wiki_db": "enwiki", "page_namespace": 0, "page_id": 1, "page_title": "Test Page", "page_redirect_title": "", "revision_text": "Some text"}, + {"wiki_db": "enwiki", "page_namespace": 0, "page_id": 2, "page_title": "Redirect Page", "page_redirect_title": "Target", "revision_text": "#REDIRECT [[Target]]"}, + {"wiki_db": "enwiki", "page_namespace": 1, "page_id": 3, "page_title": "Talk:Test", "page_redirect_title": "", "revision_text": "Talk page"}, + {"wiki_db": "frwiki", "page_namespace": 0, "page_id": 4, "page_title": "French Page", "page_redirect_title": "", "revision_text": "Texte"}, + ] + df = spark.createDataFrame(data) + result = gad.enrich_with_articles(df, ["enwiki"]) + rows = result.collect() + # Only non-redirect, namespace 0, 
enwiki + assert len(rows) == 1 + assert rows[0]["title"].lower() == "test page" + assert rows[0]["wiki_db"] == "enwiki" + +def test_enrich_with_redirects_basic(spark): + data = [ + {"wiki_db": "enwiki", "page_namespace": 0, "page_id": 1, "page_title": "Test Page", "page_redirect_title": "", "revision_text": "Some text"}, + {"wiki_db": "enwiki", "page_namespace": 0, "page_id": 2, "page_title": "Redirect Page", "page_redirect_title": "Target", "revision_text": "#REDIRECT [[Target]]"}, + {"wiki_db": "enwiki", "page_namespace": 1, "page_id": 3, "page_title": "Talk:Test", "page_redirect_title": "", "revision_text": "Talk page"}, + {"wiki_db": "frwiki", "page_namespace": 0, "page_id": 4, "page_title": "French Page", "page_redirect_title": "", "revision_text": "Texte"}, + ] + df = spark.createDataFrame(data) + result = gad.enrich_with_redirects(df, ["enwiki"]) + rows = result.collect() + # Only redirect, namespace 0, enwiki + assert len(rows) == 1 + assert rows[0]["title_from"].lower() == "redirect page" + assert rows[0]["title_to"].lower() == "target" + assert rows[0]["wiki_db"] == "enwiki" + +def test_links_udf(): + wikitext = "This is a [[Link1|Anchor1]] and [[Link2]]." + result = gad.links.func(wikitext) + # Should extract two links + assert len(result) == 2 + anchors = {pl.anchor for pl in result} + links = {pl.link for pl in result} + assert "anchor1" in anchors or "link1" in anchors + assert "Link2" in links \ No newline at end of file diff --git a/training/add_a_link/.gitlab-ci.yml b/training/add_a_link/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..b0c4c80c8fc95cfdf49dde2fde7e5a13b0fa4430 --- /dev/null +++ b/training/add_a_link/.gitlab-ci.yml @@ -0,0 +1,48 @@ +# Include conda_artifact_repo.yml to add release and conda env publishing jobs. + +default: + tags: [memory-optimized] + + +stages: + - test + - deploy + + +include: + - project: 'repos/data-engineering/workflow_utils' + ref: v0.25.0 + file: '/gitlab_ci_templates/pipelines/conda_artifact_repo.yml' + +variables: + # This path is kept identical to where `conda-dist` creates a conda env so that once + # we're done packing the env, the rest of the jobs can find it in the expected location. + CONDA_ENV_PATH: "$CI_PROJECT_DIR/dist/conda_dist_env" + + +.job_template: + before_script: + - !reference [.conda_setup_script] + - cd training/add_a_link + - conda env create -f conda-environment.yaml + - conda activate add_a_link + - pip install -e ".[dev]" + + +publish_conda_env: + stage: deploy + before_script: + - cd training/add_a_link + # Tell the template which Python package to inspect for its version. + variables: + PACKAGE_NAME: add_a_link + rules: + # Rule 1: If the commit is a tag, run this job automatically. + - if: $CI_COMMIT_TAG + when: on_success + +pytest: + stage: test + extends: .job_template + script: + - pytest ../../tests/training/add_a_link \ No newline at end of file diff --git a/training/add_a_link/conda-environment.yaml b/training/add_a_link/conda-environment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5842bb427d586a45e550d43fc36d3acdd967ad48 --- /dev/null +++ b/training/add_a_link/conda-environment.yaml @@ -0,0 +1,13 @@ +name: add_a_link +channels: + - conda-forge + - default +dependencies: + - python=3.10 + - conda-forge::pyarrow=19.0.0 + - conda-forge::icu + - conda-forge::libxml2<2.14 + - pip + - pip: + - pipx + - -e . 
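For context on what `test_links_udf` above exercises: the pipeline parses wikitext links of the form `[[Target|Anchor]]` into (anchor, link) pairs and lower-cases the anchor. Below is a minimal, self-contained sketch of that idea; the regex and the plain `.lower()` call are simplified stand-ins for the pipeline's own `LINK_RE` and `utils.normalise_anchor`/`normalise_title` (defined later in `generate_anchor_dictionary.py`), not the actual implementation.

```
import re
from dataclasses import dataclass

# Simplified wikilink pattern: captures the link target and an optional "|anchor" part.
# Illustration only -- not the pipeline's LINK_RE.
WIKILINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|([^\]]*))?\]\]")


@dataclass(frozen=True)
class ParsedLink:
    anchor: str
    link: str


def extract_links(wikitext: str) -> list[ParsedLink]:
    """Return (anchor, link) pairs; the anchor falls back to the link target."""
    parsed = []
    for link, anchor in WIKILINK_RE.findall(wikitext):
        link = link.strip()
        # Stand-in for utils.normalise_anchor(): lower-case the anchor text.
        anchor = (anchor or link).strip().lower()
        parsed.append(ParsedLink(anchor=anchor, link=link))
    return parsed


print(extract_links("This is a [[Link1|Anchor1]] and [[Link2]]."))
# [ParsedLink(anchor='anchor1', link='Link1'), ParsedLink(anchor='link2', link='Link2')]
```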
diff --git a/training/add_a_link/pyproject.toml b/training/add_a_link/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..8fb7a0927c66d37c316de7d65915551499d18138
--- /dev/null
+++ b/training/add_a_link/pyproject.toml
@@ -0,0 +1,49 @@
+[project]
+name = "add_a_link"
+version = "0.1.1"
+description = "Job logic for the 'add_a_link' pipeline"
+authors = [ { name = "ML Team", email = "ml@wikimedia.org" } ]
+requires-python = ">=3.10"
+dependencies = [
+    "pyspark==3.1.2",
+    "typer>=0.9.0",
+    "rbloom==1.5.1",
+    "mwtokenizer>=0.2.0",
+    "wikitextparser>=0.55.13",
+    "pyicu>=2.9",
+    "python-Levenshtein>=0.20.8",
+    "pyarrow>=18.1.0",
+    "mwparserfromhell>=0.6.5",
+    "numpy==1.24.4",
+    "pandas==1.5.3",
+    "scipy==1.9.3",
+    "requests",
+    "xgboost==2.1.3",
+    "scikit-learn==1.0.2",
+    "ipykernel==6.29.5",
+    "research-common @ git+https://gitlab.wikimedia.org/repos/research/research-common.git@v0.2.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "ruff >= 0.1.2",
+    "mypy >= 1.6.1",
+    "pre-commit >= 3.5.0",
+    "pytest >= 8.4.1",
+    "pytest-cov >= 4.1.0",
+    "chispa >= 0.9.4",
+    "pandas-stubs >= 1.2",
+    "syrupy >= 4.6.1",
+    "types-requests >= 2.31.0.20240311",
+    "matplotlib >= 3.8.4",
+]
+
+
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+# These scripts are the entrypoints the Airflow DAG will call.
+# They map a command-line name to a Python function.
+[project.scripts]
+"generate_anchor_dictionary.py" = "add_a_link.generate_anchor_dictionary:main"
diff --git a/training/add_a_link/src/add_a_link/README.md b/training/add_a_link/src/add_a_link/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..33d20b34d0a2c2d6e6ed7ab9ae73e118aeb621e0
--- /dev/null
+++ b/training/add_a_link/src/add_a_link/README.md
@@ -0,0 +1,72 @@
+# Add-a-link
+
+This is the repository that backs the [Wikimedia Link Recommendation service](https://wikitech.wikimedia.org/wiki/Add_Link).
+
+It contains code for training a model and generating datasets.
+
+The method is context-free and can be scaled to (virtually) any language, provided that we have enough existing links to learn from.
+
+## Setting up the local environment
+
+### Mac
+
+```
+# Step 1: Set up conda:
+Install from https://www.anaconda.com/download
+
+# Step 2: Set up Spark:
+brew install libxdmcp
+brew install libx11
+brew install openjdk@11
+# export suggested paths by brew
+
+# Step 3: Set up the environment:
+cd add_a_link
+conda env create -f conda-environment.yaml
+conda activate add_a_link
+pip install -e ".[dev]"
+
+# Step 4: Run tests:
+pre-commit run --files $(find add_a_link/src/add_a_link -name "*.py")
+pytest ../tests/add_a_link
+
+```
+
+### Docker
+# TODO
+
+## Training the model
+
+The full pipeline to train the model and generate the underlying datasets for a Wikipedia is run as a DAG in Airflow. The following is a description of all the scripts used to collect data and train a language-agnostic link recommendation model.
+
+#### generate_anchor_dictionary_spark.py
+Spark job that generates the anchor dictionary (\*.anchors.pkl) and helper dictionaries for lookup (\*.pageids.pkl, \*.redirects.pkl) from dumps. This generates the following files:
+- ```//.anchors.pkl``` Format: {mention: [candidate_link_title:number_of_occurrences,] }
+- ```//.pageids.pkl``` Format: {page_id:page_title}
+- ```//.redirects.pkl``` Format: {redirect_from_title:redirect_to_title}
+
+#### generate_wdproperties_spark.py
+Spark job that generates the wikidata-property dictionary (\*.wdproperties.pkl). For each pageid, it stores the Wikidata items listed as values for a pre-defined set of properties (e.g. P31). This generates the following files:
+- ```//.wdproperties.pkl``` Format: {page_id:[wikidata_item,]}
+
+#### filter_dict_anchor.py
+Filters out all pages from the anchor dictionary that have a Wikidata property from a pre-defined set (e.g. instance_of=date). The filter is defined manually at the beginning of the script. This generates the following files:
+- ```//.anchors.pkl``` Note: this file already exists and is only filtered so that some items are removed
+
+#### generate_backtesting_data.py
+Spark job to extract a pre-defined number of sentences containing links (from each article, only the first sentence that contains at least one link). The sentences are split into training and testing sets. This generates the following files:
+- ```//training/sentences_train.csv``` Format: page_title \t sentence_wikitext \n
+- ```//testing/sentences_test.csv``` Format: page_title \t sentence_wikitext \n
+
+#### generate_training_data.py
+Parses the training sentences and transforms them into a training set of positive and negative examples of links with features. This generates the following files:
+- ```//training/link_train.csv``` Format: page_title \t mention_text \t link_title \t feature_1 \t … \t feature_n \t label
+
+#### generate_addlink_model.py
+Trains a classifier model using [XGBoost](https://xgboost.readthedocs.io) to predict links based on features. The script can combine any number of language wikis to create a language-agnostic model. This generates the following files:
+- ```//.linkmodel.joblib``` Contains the parameters of the model; can be loaded via XGBoost.
+
+#### generate_backtesting_eval.py
+Runs the backtesting evaluation of the link recommendation model on the test sentences. The output is precision and recall metrics for several values of the link threshold. This generates the following files:
+- ```//.backtest.eval.csv``` Format: index, threshold, number_of_sentences, precision, recall \n
+- The precision and recall numbers should not be too low. One can compare with the numbers reported in previous experiments on 10+ deployed wikis ([meta](https://meta.wikimedia.org/wiki/Research:Link_recommendation_model_for_add-a-link_structured_task#Third_set_of_results_(2021-06))). For the default threshold of 0.5, the precision should be around 75% (or more) and the recall should not drop below 20%, so that there are still enough links to generate.
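To make the anchor-dictionary format above concrete, here is a minimal sketch of how a consumer could look up link candidates for a mention and turn the raw occurrence counts into candidate probabilities. The nested `{mention: {candidate_link_title: count}}` shape, the lower-cased mention keys, and the toy data are illustrative assumptions based on the format notes above, not a documented API.

```
import pickle
from pathlib import Path


def load_anchor_dictionary(path: Path) -> dict[str, dict[str, int]]:
    """Assumed shape: {mention: {candidate_link_title: number_of_occurrences}}."""
    with path.open("rb") as f:
        return pickle.load(f)


def candidate_probabilities(
    anchors: dict[str, dict[str, int]], mention: str
) -> dict[str, float]:
    """Normalise one mention's per-candidate counts into probabilities."""
    candidates = anchors.get(mention.lower(), {})
    total = sum(candidates.values())
    return {title: count / total for title, count in candidates.items()} if total else {}


# Toy in-memory dictionary standing in for a real *.anchors.pkl file:
anchors = {"das reich": {"2. Waffen-SS Panzer Tümeni Das Reich": 2}}
print(candidate_probabilities(anchors, "Das Reich"))
# {'2. Waffen-SS Panzer Tümeni Das Reich': 1.0}
```

In the Spark rewrite added later in this diff, the same information is produced as a `link_occurrence` map column in the `anchors` Parquet output (see `_aggregate_link_occurrence_by_anchor` and the notebook sample) rather than as a pickle.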
diff --git a/training/add_a_link/src/add_a_link/__init__.py b/training/add_a_link/src/add_a_link/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/training/add_a_link/src/add_a_link/generate_anchor_dictionary.py b/training/add_a_link/src/add_a_link/generate_anchor_dictionary.py new file mode 100644 index 0000000000000000000000000000000000000000..9c2e978764fa5dba8b8ca98ac59ed70a01eda4b5 --- /dev/null +++ b/training/add_a_link/src/add_a_link/generate_anchor_dictionary.py @@ -0,0 +1,489 @@ +import collections +import functools +import hashlib +import logging +import pathlib +import pickle +import re +from collections.abc import Iterable, Iterator, Sequence +from dataclasses import dataclass +from typing import Any + +import click +import mwparserfromhell # type: ignore[import-untyped] +import pandas as pd +import rbloom +from mwtokenizer import Tokenizer # type: ignore[import-untyped] +from pyspark import SparkFiles +from pyspark.sql import DataFrame, SparkSession, functions as F, types as T + +from add_a_link import ngram_utils, utils +from add_a_link.pipeline_transformations.transformation import Transformation + +MIN_LINK_PROBABILITY = 0.065 +REFERENCE_RE = re.compile(r"]*>[^<]+<\/ref>") +LINK_RE = re.compile( + r"\[\[(?P[^\n\|\]\[\<\>\{\}]{0,256})(?:\|(?P[^\[]*?))?\]\]" +) + + +@dataclass(frozen=True) +class ParsedLink: + anchor: str + link: str + + +@functools.cache +def get_tokenizer(wiki_db: str) -> Tokenizer: + wiki_url = utils.get_wiki_url(wiki_db) + language_code = utils.get_language_code(wiki_url) + return Tokenizer(language_code) + + +@F.udf +def normalized_title(title: str) -> str: + return utils.normalise_title(title) + + +def _with_normalized_page_titles(mediawiki_wikitext_current_df: DataFrame) -> DataFrame: + return mediawiki_wikitext_current_df.withColumn( + colName="normalized_page_title", + col=normalized_title("page_title"), + ).withColumn( + colName="normalized_page_redirect_title", + col=normalized_title("page_redirect_title"), + ) + + +def _filter_empty_articles(mediawiki_wikitext_current_df: DataFrame) -> DataFrame: + return mediawiki_wikitext_current_df.filter( + (F.col("revision_text").isNotNull()) & (F.length(F.col("revision_text")) > 0) + ) + + +@F.udf( + returnType=T.ArrayType( + T.StructType( + [ + T.StructField(name="anchor", dataType=T.StringType()), + T.StructField(name="link", dataType=T.StringType()), + ] + ) + ) +) +def links(wikitext: str) -> list[ParsedLink]: + return [ + ParsedLink( + anchor=utils.normalise_anchor(anchor or normalized_link), + link=normalized_link, + ) + for link, anchor in LINK_RE.findall(wikitext) + if (normalized_link := utils.normalise_title(link)) + ] + + +def _with_resolved_links(redirects_df: DataFrame) -> Transformation: + def _(links_df: DataFrame) -> DataFrame: + resolved_redirects_df = links_df.alias("links").join( + redirects_df.alias("redirects"), + how="left", + on=( + (F.col("links.wiki_db") == F.col("redirects.wiki_db")) + & (F.col("links.link") == F.col("redirects.title_from")) + ), + ) + return resolved_redirects_df.select( + "links.*", + F.coalesce(F.col("redirects.title_to"), F.col("link")).alias( + "resolved_link" + ), + ) + + return _ + + +def _filter_unlinkable_anchors(articles_df: DataFrame) -> Transformation: + def _(links_df: DataFrame) -> DataFrame: + return ( + links_df.alias("links") + .join( + articles_df.alias("articles"), + how="leftsemi", + on=( + (F.col("links.wiki_db") == F.col("articles.wiki_db")) + & 
(F.col("links.link") == F.col("articles.title")) + ), + ) + .select( + "links.*", + ) + ) + + return _ + + +def _enrich_with_anchor_linked_frequency(links_df: DataFrame) -> DataFrame: + anchor_is_non_numeric = ( + "anchor not rlike '^[0-9]+$' and anchor not rlike '^[0-9]+[/-][0-9]+$'" + ) + linked_frequency_df = ( + links_df.where(F.length("anchor") > 0) + .where(anchor_is_non_numeric) + .groupBy("wiki_db", "anchor") + .agg(F.count("*").alias("linked_frequency")) + .where(F.col("linked_frequency") > 1) + ) + return linked_frequency_df + + +def compute_hash(obj: Any) -> int: + obj_hash = hashlib.sha256(pickle.dumps(obj), usedforsecurity=False).digest() + return int.from_bytes(obj_hash[:16], "big") - 2**127 + + +def make_bloom_filter( + max_items: int, items: Iterable[Any], output_path: pathlib.Path +) -> None: + bf = rbloom.Bloom( + expected_items=max_items, + false_positive_rate=0.01, + hash_func=compute_hash, + ) + bf.update(items) + bf.save(output_path) + + +def tokenize(tokenizer: Tokenizer, text: str) -> list[str]: + return [ + stripped_token + for token in tokenizer.sentence_tokenize(text, use_abbreviation=True) + if (stripped_token := token.strip()) + ] + + +@F.udf +def stripped_wikitext(wikitext: str) -> str: + stripped_wikitext = LINK_RE.sub(".", wikitext) + stripped_wikitext = REFERENCE_RE.sub(".", stripped_wikitext) + + try: + # Try to strip additional markup such as templates via mwparserfromhell. + stripped_wikitext = mwparserfromhell.parse(stripped_wikitext).strip_code() + except Exception as err: + logging.exception(err) + + return stripped_wikitext.lower() + + +def count_occurring_anchors( + text: str, wiki_db: str, anchors: rbloom.Bloom +) -> list[tuple[str, int]]: + tokenizer = get_tokenizer(wiki_db) + sentences = tokenize(tokenizer, text) + extracted_anchors: list[str] = [] + for sentence in sentences: + for anchor_ngram_count in range(10, 0, -1): + tokens = ngram_utils.get_tokens(sentence, tokenizer) + for ngram in ngram_utils.get_ngrams(tokens, anchor_ngram_count): + if (wiki_db, ngram) in anchors: + # NOTE: We are supposed to also replace found ngram in sentence + # so we don't catch overlapping anchors, but this implementation + # doesn't do that. These comments explain why: + # https://gitlab.wikimedia.org/repos/research/research-datasets/-/merge_requests/31#note_92027 + extracted_anchors.append(ngram) + + anchor_counts = collections.Counter(extracted_anchors).most_common() + return anchor_counts + + +def _enrich_with_anchor_frequency( + spark: SparkSession, anchors_file: pathlib.Path +) -> Transformation: + """Calculate frequency of all known anchors across articles texts.""" + spark.sparkContext.addFile(path=str(anchors_file)) + + anchor_frequency_schema = T.StructType( + [ + T.StructField(name="wiki_db", dataType=T.StringType()), + T.StructField(name="pid", dataType=T.IntegerType()), + T.StructField(name="anchor", dataType=T.StringType()), + T.StructField(name="anchor_frequency", dataType=T.IntegerType()), + ] + ) + + def anchor_frequency_udf( + articles_batches: Iterable[pd.DataFrame], + ) -> Iterator[pd.DataFrame]: + # Load the bloom filter containing all known anchors ahead of + # processing any batches so we only load it once. 
+ anchors = rbloom.Bloom.load(SparkFiles.get(anchors_file.name), compute_hash) + + for batch in articles_batches: + batch["article_anchor_frequency"] = [ + count_occurring_anchors(text=wikitext, wiki_db=wiki_db, anchors=anchors) + for wiki_db, wikitext in zip(batch["wiki_db"], batch["wikitext"]) + ] + + # Explode article_anchor_frequency column into multiple rows. + df = batch.explode("article_anchor_frequency") + df = df.dropna(subset=("article_anchor_frequency",)) + + # Pop the exploded column and create a new dataframe from rows of tuples. + anchor_frequency_df = pd.DataFrame( + df.pop("article_anchor_frequency").tolist(), + columns=["anchor", "anchor_frequency"], + index=df.index, + ) + # Concatenate the original pid column to the new dataframe to + # get (pid, anchor, anchor_frequency) rows. + yield pd.concat([df[["wiki_db", "pid"]], anchor_frequency_df], axis=1) + + def _(articles_df: DataFrame) -> DataFrame: + anchor_frequency_df = ( + articles_df.select( + "pid", "wiki_db", stripped_wikitext("wikitext").alias("wikitext") + ) + .mapInPandas( + anchor_frequency_udf, # type: ignore[arg-type] + schema=anchor_frequency_schema, + ) + .groupBy("wiki_db", "anchor") + .agg(F.sum("anchor_frequency").alias("anchor_frequency")) + ) + return anchor_frequency_df + + return _ + + +def _with_link_probability( + linked_frequency_df: DataFrame, anchor_frequency_df: DataFrame +) -> Transformation: + def _(links_df: DataFrame) -> DataFrame: + link_probability_df = ( + linked_frequency_df.alias("lf") + .join( + anchor_frequency_df.alias("af"), + on=( + (F.col("lf.wiki_db") == F.col("af.wiki_db")) + & (F.col("lf.anchor") == F.col("af.anchor")) + ), + how="left", + ) + .select("lf.*", "af.anchor_frequency") + ) + link_probability_df = link_probability_df.fillna( + {"anchor_frequency": 0} + ).withColumn( + colName="link_probability", + col=F.col("linked_frequency") + / (F.col("anchor_frequency") + F.col("linked_frequency")), + ) + + return ( + links_df.alias("links") + .join( + link_probability_df.alias("lp"), + on=( + (F.col("links.wiki_db") == F.col("lp.wiki_db")) + & (F.col("links.anchor") == F.col("lp.anchor")) + ), + how="left", + ) + .select("links.*", "link_probability") + ) + + return _ + + +def _enrich_with_link_probability( + spark: SparkSession, articles_df: DataFrame, model_id +) -> Transformation: + def _(links_df: DataFrame) -> DataFrame: + linked_frequency_df = links_df.transform(_enrich_with_anchor_linked_frequency) + + # Bring all unique anchors back to the driver, *one partition at a time*, and + # add them to a bloom filter which can be later used to test if an arbitrary + # word is a known anchor. Bloom filters have a much smaller footprint than sets + # (~50 MiB vs ~3 GiB for 50M anchors) and a small number of false positives are + # not a problem since our usecase involves joining with the anchors dataframe + # afterwards, anyway. 
+ anchors = ( + (row.wiki_db, row.anchor) + for row in linked_frequency_df.select("wiki_db", "anchor").toLocalIterator() + ) + anchors_path = pathlib.Path.cwd() / f"anchor_{model_id}.bloom" + make_bloom_filter( + max_items=50_000_000, + items=anchors, + output_path=anchors_path, + ) + + anchor_frequency_df = articles_df.transform( + _enrich_with_anchor_frequency(spark=spark, anchors_file=anchors_path) + ) + + link_probability_df = links_df.distinct().transform( + _with_link_probability( + linked_frequency_df=linked_frequency_df, + anchor_frequency_df=anchor_frequency_df, + ) + ) + return link_probability_df + + return _ + + +def _aggregate_link_occurrence_by_anchor(links_df: DataFrame) -> DataFrame: + return ( + links_df.groupBy("wiki_db", "anchor", "link") + .agg(F.count(F.col("title")).alias("n")) + .groupBy("wiki_db", "anchor") + .agg( + F.map_from_entries(F.collect_list(F.struct("link", "n"))).alias( + "link_occurrence" + ) + ) + ) + + +def enrich_with_articles(df, wiki_dbs: Sequence[str]) -> Transformation: + filtered_wikitext_df = ( + df.where(F.col("wiki_db").isin(*wiki_dbs)) + .where(F.col("page_namespace") == 0) + .transform(_filter_empty_articles) + .transform(_with_normalized_page_titles) + ) + articles_df = filtered_wikitext_df.where(F.col("page_redirect_title") == "").select( + F.col("wiki_db"), + F.col("page_id").alias("pid"), + F.col("normalized_page_title").alias("title"), + F.col("revision_text").alias("wikitext"), + ) + return articles_df + + +def enrich_with_redirects(df: DataFrame, wiki_dbs: Sequence[str]) -> Transformation: + filtered_wikitext_df = ( + df.where(F.col("wiki_db").isin(*wiki_dbs)) + .where(F.col("page_namespace") == 0) + .transform(_filter_empty_articles) + .transform(_with_normalized_page_titles) + ) + redirects_df = ( + filtered_wikitext_df.where(F.col("page_redirect_title") != "") + .select( + F.col("wiki_db"), + F.col("normalized_page_title").alias("title_from"), + F.col("normalized_page_redirect_title").alias("title_to"), + ) + .distinct() + ) + return redirects_df + + +def enrich_with_anchors( + df: DataFrame, spark: SparkSession, wiki_dbs: Sequence[str], model_id +) -> Transformation: + filtered_wikitext_df = ( + df.where(F.col("wiki_db").isin(*wiki_dbs)) + .where(F.col("page_namespace") == 0) + .transform(_filter_empty_articles) + .transform(_with_normalized_page_titles) + ) + articles_df = filtered_wikitext_df.where(F.col("page_redirect_title") == "").select( + F.col("wiki_db"), + F.col("page_id").alias("pid"), + F.col("normalized_page_title").alias("title"), + F.col("revision_text").alias("wikitext"), + ) + redirects_df = ( + filtered_wikitext_df.where(F.col("page_redirect_title") != "") + .select( + F.col("wiki_db"), + F.col("normalized_page_title").alias("title_from"), + F.col("normalized_page_redirect_title").alias("title_to"), + ) + .distinct() + ) + + anchors_df = ( + articles_df.withColumn("links", F.explode(links("wikitext"))) + .select("wiki_db", "pid", "title", "links.link", "links.anchor") + .transform(_with_resolved_links(redirects_df)) + .drop("link") + .withColumnRenamed("resolved_link", "link") + .transform(_filter_unlinkable_anchors(articles_df)) + .transform( + _enrich_with_link_probability( + spark=spark, articles_df=articles_df, model_id=model_id + ) + ) + .where(F.col("link_probability") >= MIN_LINK_PROBABILITY) + .transform(_aggregate_link_occurrence_by_anchor) + ) + + return anchors_df + + +def run( + spark: SparkSession, wiki_dbs: Sequence[str], output: pathlib.Path, model_id: str +) -> None: + revision_content_col = 
F.col("revision_content_slots")["main"]["content_body"] + mediawiki_wikitext_current = ( + spark.table("wmf_content.mediawiki_content_current_v1") + .withColumn("revision_text", revision_content_col) + .withColumnRenamed("wiki_id", "wiki_db") + .withColumnRenamed("page_namespace_id", "page_namespace") + .withColumn( + "page_redirect_target", + F.when(F.col("page_redirect_target").isNull(), F.lit("")).otherwise( + F.col("page_redirect_target") + ), + ) + .withColumnRenamed("page_redirect_target", "page_redirect_title") + ) + + ( + mediawiki_wikitext_current.transform( + lambda df: enrich_with_articles(df, wiki_dbs), + ) + .write.mode("overwrite") + .parquet(str(output / "pageids")) + ) + + ( + mediawiki_wikitext_current.transform( + lambda df: enrich_with_redirects(df, wiki_dbs), + ) + .write.mode("overwrite") + .parquet(str(output / "redirects")) + ) + + ( + mediawiki_wikitext_current.transform( + lambda df: enrich_with_anchors( + df, spark=spark, wiki_dbs=wiki_dbs, model_id=model_id + ), + ) + .write.mode("overwrite") + .parquet(str(output / "anchors")) + ) + + +@click.command() +@click.option("--wiki_dbs", required=True, help="Comma-separated list of names") +@click.option("--output", required=True) +@click.option("--model_id", required=True) +def main(wiki_dbs: str, output: str, model_id: str): + # will re-use previously created sessions if there is any e.g airflow + spark = SparkSession.builder.getOrCreate() + logging.info(f"wiki_dbs: {wiki_dbs}") + logging.info(f"output: {output}") + logging.info(f"model_id: {model_id}") + output_input = pathlib.Path(output) + wiki_dbs_input = wiki_dbs.split(",") + run(spark, wiki_dbs_input, output_input, model_id) + + +if __name__ == "__main__": + main() diff --git a/training/add_a_link/src/add_a_link/ngram_utils.py b/training/add_a_link/src/add_a_link/ngram_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bc3efad51ee13abafbee321d5bbdc6eede74ece0 --- /dev/null +++ b/training/add_a_link/src/add_a_link/ngram_utils.py @@ -0,0 +1,37 @@ +from collections.abc import Generator + +from mwtokenizer import Tokenizer # type: ignore[import-untyped] + + +def tokenize_sentence(text: str, tokenizer: Tokenizer) -> Generator[str, None, None]: + """split text into sentences. + - split by newlines because mwtokenizer does not split by newline + - extract sentences using mwtokenizer + """ + for line in text.split("\n"): + if line and len(line) > 0: + yield from tokenizer.sentence_tokenize(line, use_abbreviation=True) + + +def get_tokens(sent: str, tokenizer: Tokenizer) -> list[str]: + """tokenize a sentence. 
+ e.g: "Berlin, Germany" tokenizes to ["Berlin", ",", " ", "Germany"] + """ + return list(tokenizer.word_tokenize(sent, use_abbreviation=True)) + + +def get_ngrams(tokens: list[str], n: int) -> Generator[str, None, None]: + """concatenate n non-whitespace tokens""" + for i_start, w_start in enumerate(tokens): + if w_start == " ": + continue + gram = "" + gram_count = 0 + for j in range(i_start, len(tokens)): + w = tokens[j] + gram += w + if w != " ": + gram_count += 1 + if gram_count == n: + yield gram + break diff --git a/training/add_a_link/src/add_a_link/notebooks/pipeline_run.ipynb b/training/add_a_link/src/add_a_link/notebooks/pipeline_run.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..0bc188a506433472018243323181442cab8fc2b9 --- /dev/null +++ b/training/add_a_link/src/add_a_link/notebooks/pipeline_run.ipynb @@ -0,0 +1,1691 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "f68aa58e", + "metadata": {}, + "outputs": [], + "source": [ + "!export LD_LIBRARY_PATH=\"$CONDA_PREFIX/lib\"\n", + "\n", + "!export HADOOP_CONF_DIR=\"/etc/hadoop/conf\"\n", + "!export HADOOP_HOME=\"/usr/lib/hadoop\"\n", + "!export JAVA_HOME=$(dirname $(dirname $(readlink -f $(which javac))))\n", + "!export CLASSPATH=$CLASSPATH:`hadoop classpath --glob`" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7d6412c6", + "metadata": { + "ExecuteTime": { + "end_time": "2025-07-17T12:36:00.343708Z", + "start_time": "2025-07-17T12:36:00.340221Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6937bd8d", + "metadata": { + "ExecuteTime": { + "end_time": "2025-07-17T12:36:01.288104Z", + "start_time": "2025-07-17T12:36:01.276366Z" + } + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "586923a7", + "metadata": {}, + "outputs": [], + "source": [ + "# from research_datasets.mwaddlink import ngram_utils, utils\n", + "import pathlib\n", + "\n", + "# from research_transformations.transformation import Transformation\n", + "# from research_datasets.mwaddlink.generate_anchor_dictionary import (\n", + "# ParsedLink,\n", + "# compute_hash,\n", + "# )\n", + "from pyspark.sql import SparkSession\n", + "\n", + "\n", + "# from research_datasets.mwaddlink import ngram_utils, utils\n", + "# from research_transformations.transformation import Transformation" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e08b2c3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SPARK_HOME: /usr/lib/spark3/\n", + "Using Hadoop client lib jars at 3.2.0, provided by Spark.\n", + "PYSPARK_DRIVER_PYTHON=python\n", + "PYSPARK_PYTHON=./env/bin/python\n", + ":: loading settings :: file = /etc/maven/ivysettings.xml\n", + ":: loading settings :: url = jar:file:/opt/conda-analytics/lib/python3.10/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /tmp/ivy_spark3/cache\n", + "The jars for the packages stored in: /tmp/ivy_spark3/home/jars\n", + "org.apache.spark#spark-avro_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-7e6bccb6-a412-4533-85f6-8430eaf0784e;1.0\n", + "\tconfs: [default]\n", + "\tfound org.apache.spark#spark-avro_2.12;3.1.2 in mirrored\n", + "\tfound 
org.spark-project.spark#unused;1.0.0 in mirrored\n", + ":: resolution report :: resolve 137ms :: artifacts dl 4ms\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 2 | 0 | 0 | 0 || 2 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-7e6bccb6-a412-4533-85f6-8430eaf0784e\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 2 already retrieved (0kB/4ms)\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "25/07/18 09:45:18 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).\n", + "25/07/18 09:45:18 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.\n", + "25/07/18 09:45:18 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.\n", + "25/07/18 09:45:18 WARN Utils: Service 'sparkDriver' could not bind on port 12002. Attempting port 12003.\n", + "25/07/18 09:45:18 WARN Utils: Service 'sparkDriver' could not bind on port 12003. Attempting port 12004.\n", + "25/07/18 09:45:18 WARN Utils: Service 'sparkDriver' could not bind on port 12004. Attempting port 12005.\n", + "25/07/18 09:45:18 WARN Utils: Service 'sparkDriver' could not bind on port 12005. Attempting port 12006.\n", + "25/07/18 09:45:18 WARN Utils: Service 'sparkDriver' could not bind on port 12006. Attempting port 12007.\n", + "25/07/18 09:45:18 WARN Utils: Service 'sparkDriver' could not bind on port 12007. Attempting port 12008.\n", + "25/07/18 09:45:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", + "25/07/18 09:45:19 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n", + "25/07/18 09:45:19 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n", + "25/07/18 09:45:19 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.\n", + "25/07/18 09:45:19 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.\n", + "25/07/18 09:45:19 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.\n", + "25/07/18 09:45:19 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.\n", + "25/07/18 09:45:19 WARN Utils: Service 'SparkUI' could not bind on port 4047. Attempting port 4048.\n", + "25/07/18 09:45:31 WARN Client: Same path resource file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar added multiple times to distributed cache.\n", + "25/07/18 09:45:31 WARN Client: Same path resource file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar added multiple times to distributed cache.\n", + "25/07/18 09:45:56 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13000. Attempting port 13001.\n", + "25/07/18 09:45:56 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13001. 
Attempting port 13002.\n", + "25/07/18 09:45:56 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13002. Attempting port 13003.\n", + "25/07/18 09:45:56 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13003. Attempting port 13004.\n", + "25/07/18 09:45:56 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13004. Attempting port 13005.\n", + "25/07/18 09:45:56 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13005. Attempting port 13006.\n", + "25/07/18 09:45:56 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13006. Attempting port 13007.\n", + "25/07/18 09:45:56 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 13007. Attempting port 13008.\n", + "25/07/18 09:45:56 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!\n" + ] + } + ], + "source": [ + "from research_common.spark import create_yarn_spark_session\n", + "\n", + "# spark.stop()\n", + "spark = create_yarn_spark_session(\n", + " app_id=\"interactive\",\n", + " gitlab_project=\"ozge/ml-pipelines\",\n", + " extra_config={\n", + " \"spark.dynamicAllocation.maxExecutors\": 119,\n", + " \"spark.executor.cores\": 4,\n", + " \"spark.sql.adaptive.enabled\": \"true\",\n", + " \"spark.sql.shuffle.partitions\": 1000,\n", + " \"spark.driver.memory\": \"32G\",\n", + " \"spark.driver.maxResultSize\": \"40G\",\n", + " \"spark.executor.memory\": \"32g\",\n", + " \"spark.executor.memoryOverhead\": \"4g\",\n", + " \"spark.network.timeout\": \"1200s\",\n", + " \"spark.shuffle.io.retryWait\": \"1200s\",\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2f8892c9", + "metadata": {}, + "outputs": [], + "source": [ + "from add_a_link import generate_anchor_dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "13a242ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "spark.eventLog.enabled = true\n", + "spark.eventLog.dir = hdfs:///var/log/spark\n", + "spark.network.crypto.keyLength = 256\n", + "spark.sql.shuffle.partitions = 1000\n", + "spark.network.crypto.enabled = true\n", + "spark.sql.warehouse.dir = hdfs:///user/hive/warehouse\n", + "spark.network.crypto.keyFactoryAlgorithm = PBKDF2WithHmacSHA256\n", + "spark.pyspark.driver.python = python\n", + "spark.sql.catalog.spark_catalog = org.apache.iceberg.spark.SparkSessionCatalog\n", + "spark.driver.blockManager.port = 13000\n", + "spark.yarn.appMasterEnv.REQUESTS_CA_BUNDLE = /etc/ssl/certs/ca-certificates.crt\n", + "spark.yarn.archive = hdfs:///user/spark/share/lib/spark-3.1.2-assembly.jar\n", + "spark.network.timeout = 1200s\n", + "spark.pyspark.python = ./env/bin/python\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS = an-master1003.eqiad.wmnet,an-master1004.eqiad.wmnet\n", + "spark.jars.packages = org.apache.spark:spark-avro_2.12:3.1.2\n", + "spark.executorEnv.PYTHONPATH = /etc/jupyterhub-conda{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/org.apache.spark_spark-avro_2.12-3.1.2.jar{{PWD}}/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.shuffle.io.maxRetries = 10\n", + "spark.local.dir = /srv/spark-tmp\n", + "spark.ui.filters = 
org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter\n", + "spark.sql.execution.arrow.pyspark.enabled = true\n", + "spark.executor.defaultJavaOptions = -Djava.net.useSystemProxies=True\n", + "spark.app.name = interactive\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES = http://an-master1003.eqiad.wmnet:8088/proxy/application_1750705250302_776086,http://an-master1004.eqiad.wmnet:8088/proxy/application_1750705250302_776086\n", + "spark.executor.id = driver\n", + "spark.yarn.dist.archives = https://gitlab.wikimedia.org/api/v4/projects/3455/packages/generic/add_a_link/0.1.1/add_a_link-0.1.1.conda.tgz#env\n", + "spark.sql.extensions = org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\n", + "spark.driver.appUIAddress = http://stat1010.eqiad.wmnet:4048\n", + "spark.executor.memory = 32g\n", + "spark.driver.maxResultSize = 40G\n", + "spark.sql.catalogImplementation = hive\n", + "spark.driver.host = stat1010.eqiad.wmnet\n", + "spark.dynamicAllocation.cachedExecutorIdleTimeout = 3600s\n", + "spark.network.crypto.saslFallback = false\n", + "spark.shuffle.io.retryWait = 1200s\n", + "spark.driver.port = 12008\n", + "spark.authenticate = true\n", + "spark.executor.memoryOverhead = 4g\n", + "spark.port.maxRetries = 100\n", + "spark.driver.extraJavaOptions = -Divy.cache.dir=/tmp/ivy_spark3/cache -Divy.home=/tmp/ivy_spark3/home \n", + "spark.app.id = application_1750705250302_776086\n", + "spark.dynamicAllocation.maxExecutors = 119\n", + "spark.driver.defaultJavaOptions = -Djava.net.useSystemProxies=True\n", + "spark.repl.local.jars = file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.submit.pyFiles = /tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,/tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.ui.port = 4040\n", + "spark.executor.cores = 4\n", + "spark.yarn.dist.pyFiles = file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.sql.files.maxPartitionBytes = 268435456\n", + "spark.executorEnv.REQUESTS_CA_BUNDLE = /etc/ssl/certs/ca-certificates.crt\n", + "spark.serializer.objectStreamReset = 100\n", + "spark.dynamicAllocation.executorIdleTimeout = 60s\n", + "spark.submit.deployMode = client\n", + "spark.app.startTime = 1752831918368\n", + "spark.yarn.secondary.jars = org.apache.spark_spark-avro_2.12-3.1.2.jar,org.spark-project.spark_unused-1.0.0.jar\n", + "spark.yarn.dist.jars = file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.shuffle.service.enabled = true\n", + "spark.executorEnv.LD_LIBRARY_PATH = /usr/lib/hadoop/lib/native\n", + "spark.sql.adaptive.enabled = true\n", + "spark.jars.ivySettings = /etc/maven/ivysettings.xml\n", + "spark.yarn.historyServer.address = yarn.wikimedia.org\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.RM_HA_URLS = an-master1003.eqiad.wmnet:8088,an-master1004.eqiad.wmnet:8088\n", + "spark.master = yarn\n", + "spark.rdd.compress = True\n", + "spark.eventLog.compress = true\n", + "spark.yarn.isPython = true\n", + "spark.dynamicAllocation.enabled = true\n", + "spark.driver.memory = 32G\n", + "spark.ui.proxyBase = /proxy/application_1750705250302_776086\n", + "spark.sql.catalog.spark_catalog.type = hive\n", 
+ "spark.ui.showConsoleProgress = true\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m model_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrwiki\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 3\u001b[0m output \u001b[38;5;241m=\u001b[39m pathlib\u001b[38;5;241m.\u001b[39mPath(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/user/ozge/test_addalink_ml/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m \u001b[43mgenerate_anchor_dictionary\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mspark\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mmodel_id\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshard_1\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/srv/home/ozge/repos/wiki/gitlab/ozge/ml-pipelines/add_a_link/src/add_a_link/generate_anchor_dictionary.py:460\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(spark, wiki_dbs, output, model_id)\u001b[0m\n\u001b[1;32m 442\u001b[0m revision_content_col \u001b[38;5;241m=\u001b[39m F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrevision_content_slots\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmain\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent_body\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 443\u001b[0m mediawiki_wikitext_current \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 444\u001b[0m spark\u001b[38;5;241m.\u001b[39mtable(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwmf_content.mediawiki_content_current_v1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 445\u001b[0m \u001b[38;5;241m.\u001b[39mwithColumn(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrevision_text\u001b[39m\u001b[38;5;124m\"\u001b[39m, revision_content_col)\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;241m.\u001b[39mwithColumnRenamed(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpage_redirect_target\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpage_redirect_title\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 453\u001b[0m )\n\u001b[1;32m 455\u001b[0m (\n\u001b[1;32m 456\u001b[0m \u001b[43mmediawiki_wikitext_current\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 457\u001b[0m \u001b[43m \u001b[49m\u001b[43menrich_with_articles\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwiki_dbs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 458\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 459\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43moverwrite\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m--> 460\u001b[0m 
\u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparquet\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpageids\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 461\u001b[0m )\n\u001b[1;32m 463\u001b[0m (\n\u001b[1;32m 464\u001b[0m mediawiki_wikitext_current\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[1;32m 465\u001b[0m enrich_with_redirects(wiki_dbs),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 468\u001b[0m \u001b[38;5;241m.\u001b[39mparquet(\u001b[38;5;28mstr\u001b[39m(output \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mredirects\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 469\u001b[0m )\n\u001b[1;32m 471\u001b[0m (\n\u001b[1;32m 472\u001b[0m mediawiki_wikitext_current\u001b[38;5;241m.\u001b[39mtransform(\n\u001b[1;32m 473\u001b[0m enrich_with_anchors(spark\u001b[38;5;241m=\u001b[39mspark, wiki_dbs\u001b[38;5;241m=\u001b[39mwiki_dbs, model_id\u001b[38;5;241m=\u001b[39mmodel_id),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;241m.\u001b[39mparquet(\u001b[38;5;28mstr\u001b[39m(output \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124manchors\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 477\u001b[0m )\n", + "File \u001b[0;32m~/.conda/envs/add_a_link/lib/python3.10/site-packages/pyspark/sql/readwriter.py:1250\u001b[0m, in \u001b[0;36mDataFrameWriter.parquet\u001b[0;34m(self, path, mode, partitionBy, compression)\u001b[0m\n\u001b[1;32m 1248\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpartitionBy(partitionBy)\n\u001b[1;32m 1249\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_opts(compression\u001b[38;5;241m=\u001b[39mcompression)\n\u001b[0;32m-> 1250\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jwrite\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparquet\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/add_a_link/lib/python3.10/site-packages/py4j/java_gateway.py:1303\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1296\u001b[0m args_command, temp_args \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_build_args(\u001b[38;5;241m*\u001b[39margs)\n\u001b[1;32m 1298\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1299\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1300\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1301\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[0;32m-> 1303\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend_command\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1304\u001b[0m return_value \u001b[38;5;241m=\u001b[39m get_return_value(\n\u001b[1;32m 1305\u001b[0m answer, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget_id, 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n", + "File \u001b[0;32m~/.conda/envs/add_a_link/lib/python3.10/site-packages/py4j/java_gateway.py:1033\u001b[0m, in \u001b[0;36mGatewayClient.send_command\u001b[0;34m(self, command, retry, binary)\u001b[0m\n\u001b[1;32m 1031\u001b[0m connection \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_connection()\n\u001b[1;32m 1032\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1033\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mconnection\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend_command\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1034\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m binary:\n\u001b[1;32m 1035\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_create_connection_guard(connection)\n", + "File \u001b[0;32m~/.conda/envs/add_a_link/lib/python3.10/site-packages/py4j/java_gateway.py:1200\u001b[0m, in \u001b[0;36mGatewayConnection.send_command\u001b[0;34m(self, command)\u001b[0m\n\u001b[1;32m 1196\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Py4JNetworkError(\n\u001b[1;32m 1197\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError while sending\u001b[39m\u001b[38;5;124m\"\u001b[39m, e, proto\u001b[38;5;241m.\u001b[39mERROR_ON_SEND)\n\u001b[1;32m 1199\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1200\u001b[0m answer \u001b[38;5;241m=\u001b[39m smart_decode(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstream\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m[:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 1201\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAnswer received: \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(answer))\n\u001b[1;32m 1202\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m answer\u001b[38;5;241m.\u001b[39mstartswith(proto\u001b[38;5;241m.\u001b[39mRETURN_MESSAGE):\n", + "File \u001b[0;32m~/.conda/envs/add_a_link/lib/python3.10/socket.py:717\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 716\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 717\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "model_id = \"trwiki\"\n", + "output = pathlib.Path(f\"/user/ozge/test_addalink_ml/{model_id}\")\n", + "generate_anchor_dictionary.run(spark, [model_id], output, \"shard_1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "99e21fd9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "479071\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wiki_dbanchorlink_occurrence
0trwiki\"angel\" (mika newton şarkısı){'Angel (Mika Newton şarkısı)': 2}
1trwiki\"das reich\" tümeni{'2. Waffen-SS Panzer Tümeni Das Reich': 2}
2trwiki\"hangman\" adam page{'Adam Page': 3}
3trwiki\"i'll be there for you\"{'I'll Be There for You (The Rembrandts şarkıs...
4trwiki\"leningrad'ın savunmasına göre\" madalyası{'Leningrad Savunması Madalyası': 2}
\n", + "
" + ], + "text/plain": [ + " wiki_db anchor \\\n", + "0 trwiki \"angel\" (mika newton şarkısı) \n", + "1 trwiki \"das reich\" tümeni \n", + "2 trwiki \"hangman\" adam page \n", + "3 trwiki \"i'll be there for you\" \n", + "4 trwiki \"leningrad'ın savunmasına göre\" madalyası \n", + "\n", + " link_occurrence \n", + "0 {'Angel (Mika Newton şarkısı)': 2} \n", + "1 {'2. Waffen-SS Panzer Tümeni Das Reich': 2} \n", + "2 {'Adam Page': 3} \n", + "3 {'I'll Be There for You (The Rembrandts şarkıs... \n", + "4 {'Leningrad Savunması Madalyası': 2} " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_anchors = spark.read.parquet(\"/user/ozge/test_addalink_ml/trwiki/anchors\")\n", + "print(df_anchors.count())\n", + "df_anchors.limit(5).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bda889b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'40G'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "spark2.conf.get(\"spark.driver.maxResultSize\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0c1ca96d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "spark.eventLog.enabled = true\n", + "spark.eventLog.dir = hdfs:///var/log/spark\n", + "spark.network.crypto.keyLength = 256\n", + "spark.sql.shuffle.partitions = 1000\n", + "spark.network.crypto.enabled = true\n", + "spark.driver.port = 12007\n", + "spark.sql.warehouse.dir = hdfs:///user/hive/warehouse\n", + "spark.network.crypto.keyFactoryAlgorithm = PBKDF2WithHmacSHA256\n", + "spark.pyspark.driver.python = python\n", + "spark.sql.catalog.spark_catalog = org.apache.iceberg.spark.SparkSessionCatalog\n", + "spark.driver.blockManager.port = 13000\n", + "spark.yarn.appMasterEnv.REQUESTS_CA_BUNDLE = /etc/ssl/certs/ca-certificates.crt\n", + "spark.yarn.archive = hdfs:///user/spark/share/lib/spark-3.1.2-assembly.jar\n", + "spark.network.timeout = 1200s\n", + "spark.pyspark.python = ./env/bin/python\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS = an-master1003.eqiad.wmnet,an-master1004.eqiad.wmnet\n", + "spark.jars.packages = org.apache.spark:spark-avro_2.12:3.1.2\n", + "spark.executorEnv.PYTHONPATH = /etc/jupyterhub-conda{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/org.apache.spark_spark-avro_2.12-3.1.2.jar{{PWD}}/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.shuffle.io.maxRetries = 10\n", + "spark.local.dir = /srv/spark-tmp\n", + "spark.ui.filters = org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter\n", + "spark.sql.execution.arrow.pyspark.enabled = true\n", + "spark.executor.defaultJavaOptions = -Djava.net.useSystemProxies=True\n", + "spark.app.name = interactive\n", + "spark.executor.id = driver\n", + "spark.yarn.dist.archives = https://gitlab.wikimedia.org/api/v4/projects/3455/packages/generic/add_a_link/0.1.1/add_a_link-0.1.1.conda.tgz#env\n", + "spark.app.startTime = 1752825909557\n", + "spark.sql.extensions = org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\n", + "spark.executor.memory = 32g\n", + "spark.driver.maxResultSize = 40G\n", + "spark.sql.catalogImplementation = hive\n", + "spark.driver.host = stat1010.eqiad.wmnet\n", + "spark.dynamicAllocation.cachedExecutorIdleTimeout = 3600s\n", + "spark.network.crypto.saslFallback = false\n", + 
"spark.shuffle.io.retryWait = 1200s\n", + "spark.authenticate = true\n", + "spark.executor.memoryOverhead = 4g\n", + "spark.port.maxRetries = 100\n", + "spark.driver.extraJavaOptions = -Divy.cache.dir=/tmp/ivy_spark3/cache -Divy.home=/tmp/ivy_spark3/home \n", + "spark.dynamicAllocation.maxExecutors = 119\n", + "spark.driver.defaultJavaOptions = -Djava.net.useSystemProxies=True\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES = http://an-master1003.eqiad.wmnet:8088/proxy/application_1750705250302_774527,http://an-master1004.eqiad.wmnet:8088/proxy/application_1750705250302_774527\n", + "spark.repl.local.jars = file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.submit.pyFiles = /tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,/tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.ui.port = 4040\n", + "spark.executor.cores = 4\n", + "spark.yarn.dist.pyFiles = file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.sql.files.maxPartitionBytes = 268435456\n", + "spark.executorEnv.REQUESTS_CA_BUNDLE = /etc/ssl/certs/ca-certificates.crt\n", + "spark.serializer.objectStreamReset = 100\n", + "spark.dynamicAllocation.executorIdleTimeout = 60s\n", + "spark.submit.deployMode = client\n", + "spark.yarn.secondary.jars = org.apache.spark_spark-avro_2.12-3.1.2.jar,org.spark-project.spark_unused-1.0.0.jar\n", + "spark.yarn.dist.jars = file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.shuffle.service.enabled = true\n", + "spark.executorEnv.LD_LIBRARY_PATH = /usr/lib/hadoop/lib/native\n", + "spark.sql.adaptive.enabled = true\n", + "spark.jars.ivySettings = /etc/maven/ivysettings.xml\n", + "spark.yarn.historyServer.address = yarn.wikimedia.org\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.RM_HA_URLS = an-master1003.eqiad.wmnet:8088,an-master1004.eqiad.wmnet:8088\n", + "spark.ui.proxyBase = /proxy/application_1750705250302_774527\n", + "spark.master = yarn\n", + "spark.driver.appUIAddress = http://stat1010.eqiad.wmnet:4047\n", + "spark.rdd.compress = True\n", + "spark.eventLog.compress = true\n", + "spark.yarn.isPython = true\n", + "spark.dynamicAllocation.enabled = true\n", + "spark.driver.memory = 32G\n", + "spark.app.id = application_1750705250302_774527\n", + "spark.ui.showConsoleProgress = true\n", + "spark.sql.catalog.spark_catalog.type = hive\n" + ] + } + ], + "source": [ + "for item in spark.sparkContext.getConf().getAll():\n", + " print(f\"{item[0]} = {item[1]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a0dbacb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/07/18 09:54:14 WARN BlockReaderFactory: I/O error constructing remote block reader.\n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. 
ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56506 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN DFSClient: Failed to connect to /10.64.161.6:50010 for file /wmf/data/wmf_content/mediawiki_content_current_v1/metadata/7be98844-2cff-42ef-9cf2-b37599179c73-m18.avro for block BP-1552854784-10.64.21.110-1405114489661:blk_2773679055_1700073530, add to deadNodes and continue. 
\n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56506 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN BlockReaderFactory: I/O error constructing remote block reader.\n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. 
ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56516 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN BlockReaderFactory: I/O error constructing remote block reader.\n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. 
ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56530 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN DFSClient: Failed to connect to /10.64.161.6:50010 for file /wmf/data/wmf_content/mediawiki_content_current_v1/metadata/7be98844-2cff-42ef-9cf2-b37599179c73-m20.avro for block BP-1552854784-10.64.21.110-1405114489661:blk_2773679052_1700073527, add to deadNodes and continue. 
\n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56516 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN DFSClient: Failed to connect to /10.64.161.6:50010 for file /wmf/data/wmf_content/mediawiki_content_current_v1/metadata/7be98844-2cff-42ef-9cf2-b37599179c73-m8.avro for block 
BP-1552854784-10.64.21.110-1405114489661:blk_2773679036_1700073511, add to deadNodes and continue. \n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56530 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN BlockReaderFactory: I/O error constructing remote block reader.\n", + 
"java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56564 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN DFSClient: Failed to connect to /10.64.161.6:50010 for file /wmf/data/wmf_content/mediawiki_content_current_v1/metadata/7be98844-2cff-42ef-9cf2-b37599179c73-m15.avro for block 
BP-1552854784-10.64.21.110-1405114489661:blk_2773679059_1700073534, add to deadNodes and continue. \n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56564 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN BlockReaderFactory: I/O error constructing remote block reader.\n", + 
"java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56538 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN BlockReaderFactory: I/O error constructing remote block reader.\n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. 
ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56542 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN DFSClient: Failed to connect to /10.64.161.6:50010 for file /wmf/data/wmf_content/mediawiki_content_current_v1/metadata/7be98844-2cff-42ef-9cf2-b37599179c73-m19.avro for block BP-1552854784-10.64.21.110-1405114489661:blk_2773679050_1700073525, add to deadNodes and continue. 
\n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56538 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN DFSClient: Failed to connect to /10.64.161.6:50010 for file /wmf/data/wmf_content/mediawiki_content_current_v1/metadata/7be98844-2cff-42ef-9cf2-b37599179c73-m35.avro for block 
BP-1552854784-10.64.21.110-1405114489661:blk_2773679053_1700073528, add to deadNodes and continue. \n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56542 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN BlockReaderFactory: I/O error constructing remote block reader.\n", + 
"java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56556 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN DFSClient: Failed to connect to /10.64.161.6:50010 for file /wmf/data/wmf_content/mediawiki_content_current_v1/metadata/7be98844-2cff-42ef-9cf2-b37599179c73-m32.avro for block 
BP-1552854784-10.64.21.110-1405114489661:blk_2773679042_1700073517, add to deadNodes and continue. \n", + "java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56556 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN BlockReaderFactory: I/O error constructing remote block reader.\n", + 
"java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.64.138.6:56588 remote=/10.64.161.6:50010]\n", + "\tat org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)\n", + "\tat org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:184)\n", + "\tat org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:658)\n", + "\tat java.io.FilterInputStream.read(FilterInputStream.java:83)\n", + "\tat org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)\n", + "\tat org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)\n", + "\tat org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:829)\n", + "\tat java.io.DataInputStream.read(DataInputStream.java:149)\n", + "\tat org.apache.iceberg.hadoop.HadoopStreams$HadoopSeekableInputStream.read(HadoopStreams.java:123)\n", + "\tat org.apache.iceberg.avro.AvroIO$AvroInputStreamAdapter.read(AvroIO.java:117)\n", + "\tat org.apache.iceberg.shaded.org.apache.avro.file.DataFileReader.openReader(DataFileReader.java:65)\n", + "\tat org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)\n", + "\tat org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7$1.(CloseableIterable.java:188)\n", + "\tat org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)\n", + "\tat org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)\n", + "\tat org.apache.iceberg.DeleteFileIndex$Builder.lambda$build$0(DeleteFileIndex.java:425)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.runTaskWithRetry(Tasks.java:413)\n", + "\tat org.apache.iceberg.util.Tasks$Builder.access$300(Tasks.java:69)\n", + "\tat org.apache.iceberg.util.Tasks$Builder$1.run(Tasks.java:315)\n", + "\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n", + "\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n", + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", + "\tat java.lang.Thread.run(Thread.java:750)\n", + "25/07/18 09:54:14 WARN DFSClient: Failed to connect to /10.64.161.6:50010 for file /wmf/data/wmf_content/mediawiki_content_current_v1/metadata/7be98844-2cff-42ef-9cf2-b37599179c73-m5.avro for block 
BP-1552854784-10.64.21.110-1405114489661:blk_2773679029_1700073504, add to deadNodes and continue. \n", + "[... the same SocketTimeoutException warning and stack trace repeat for metadata files m38.avro, m6.avro, m37.avro and m23.avro; duplicate traces omitted ...]\n", + "[Stage 0:> (0 + 0) / 20]\r" + ] + } + ], + "source": [ + "spark2 = SparkSession.builder.getOrCreate()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 14, + "id": "2c336c2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "spark.eventLog.enabled = true\n", + "spark.eventLog.dir = hdfs:///var/log/spark\n", + "spark.network.crypto.keyLength = 256\n", + "spark.sql.shuffle.partitions = 1000\n", + "spark.network.crypto.enabled = true\n", + "spark.sql.warehouse.dir = hdfs:///user/hive/warehouse\n", + "spark.network.crypto.keyFactoryAlgorithm = PBKDF2WithHmacSHA256\n", + "spark.pyspark.driver.python = python\n", + "spark.sql.catalog.spark_catalog = org.apache.iceberg.spark.SparkSessionCatalog\n", + "spark.driver.blockManager.port = 13000\n", + "spark.yarn.appMasterEnv.REQUESTS_CA_BUNDLE = /etc/ssl/certs/ca-certificates.crt\n", + "spark.yarn.archive = hdfs:///user/spark/share/lib/spark-3.1.2-assembly.jar\n", + "spark.network.timeout = 1200s\n", + "spark.pyspark.python = ./env/bin/python\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS = an-master1003.eqiad.wmnet,an-master1004.eqiad.wmnet\n", + "spark.jars.packages = org.apache.spark:spark-avro_2.12:3.1.2\n", + "spark.executorEnv.PYTHONPATH = /etc/jupyterhub-conda{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/org.apache.spark_spark-avro_2.12-3.1.2.jar{{PWD}}/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.shuffle.io.maxRetries = 10\n", + "spark.local.dir = /srv/spark-tmp\n", + "spark.ui.filters = org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter\n", + "spark.sql.execution.arrow.pyspark.enabled = true\n", + "spark.executor.defaultJavaOptions = -Djava.net.useSystemProxies=True\n", + "spark.app.name = interactive\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES = http://an-master1003.eqiad.wmnet:8088/proxy/application_1750705250302_776086,http://an-master1004.eqiad.wmnet:8088/proxy/application_1750705250302_776086\n", + "spark.executor.id = driver\n", + "spark.yarn.dist.archives = https://gitlab.wikimedia.org/api/v4/projects/3455/packages/generic/add_a_link/0.1.1/add_a_link-0.1.1.conda.tgz#env\n", + "spark.sql.extensions = org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\n", + "spark.driver.appUIAddress = http://stat1010.eqiad.wmnet:4048\n", + "spark.executor.memory = 32g\n", + "spark.driver.maxResultSize = 40G\n", + "spark.sql.catalogImplementation = hive\n", + "spark.driver.host = stat1010.eqiad.wmnet\n", + "spark.dynamicAllocation.cachedExecutorIdleTimeout = 3600s\n", + "spark.network.crypto.saslFallback = false\n", + "spark.shuffle.io.retryWait = 1200s\n", + "spark.driver.port = 12008\n", + "spark.authenticate = true\n", + "spark.executor.memoryOverhead = 4g\n", + "spark.port.maxRetries = 100\n", + "spark.driver.extraJavaOptions = -Divy.cache.dir=/tmp/ivy_spark3/cache -Divy.home=/tmp/ivy_spark3/home \n", + "spark.app.id = application_1750705250302_776086\n", + "spark.dynamicAllocation.maxExecutors = 119\n", + "spark.driver.defaultJavaOptions = -Djava.net.useSystemProxies=True\n", + "spark.repl.local.jars = file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.submit.pyFiles = /tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,/tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.ui.port = 4040\n", + "spark.executor.cores = 4\n", + "spark.yarn.dist.pyFiles = 
file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.sql.files.maxPartitionBytes = 268435456\n", + "spark.executorEnv.REQUESTS_CA_BUNDLE = /etc/ssl/certs/ca-certificates.crt\n", + "spark.serializer.objectStreamReset = 100\n", + "spark.dynamicAllocation.executorIdleTimeout = 60s\n", + "spark.submit.deployMode = client\n", + "spark.app.startTime = 1752831918368\n", + "spark.yarn.secondary.jars = org.apache.spark_spark-avro_2.12-3.1.2.jar,org.spark-project.spark_unused-1.0.0.jar\n", + "spark.yarn.dist.jars = file:///tmp/ivy_spark3/home/jars/org.apache.spark_spark-avro_2.12-3.1.2.jar,file:///tmp/ivy_spark3/home/jars/org.spark-project.spark_unused-1.0.0.jar\n", + "spark.shuffle.service.enabled = true\n", + "spark.executorEnv.LD_LIBRARY_PATH = /usr/lib/hadoop/lib/native\n", + "spark.sql.adaptive.enabled = true\n", + "spark.jars.ivySettings = /etc/maven/ivysettings.xml\n", + "spark.yarn.historyServer.address = yarn.wikimedia.org\n", + "spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.RM_HA_URLS = an-master1003.eqiad.wmnet:8088,an-master1004.eqiad.wmnet:8088\n", + "spark.master = yarn\n", + "spark.rdd.compress = True\n", + "spark.eventLog.compress = true\n", + "spark.yarn.isPython = true\n", + "spark.dynamicAllocation.enabled = true\n", + "spark.driver.memory = 32G\n", + "spark.ui.proxyBase = /proxy/application_1750705250302_776086\n", + "spark.sql.catalog.spark_catalog.type = hive\n", + "spark.ui.showConsoleProgress = true\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/07/18 09:54:32 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources\n" + ] + } + ], + "source": [ + "for item in spark2.sparkContext.getConf().getAll():\n", + " print(f\"{item[0]} = {item[1]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "67923e45", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 3 items\n", + "drwxr-x--- - ozge ozge 0 2025-07-18 13:15 /user/ozge/test_addalink_ml_airflow/trwiki/anchors\n", + "drwxr-x--- - ozge ozge 0 2025-07-18 13:00 /user/ozge/test_addalink_ml_airflow/trwiki/pageids\n", + "drwxr-x--- - ozge ozge 0 2025-07-18 13:00 /user/ozge/test_addalink_ml_airflow/trwiki/redirects\n" + ] + } + ], + "source": [ + "!hdfs dfs -ls /user/ozge/test_addalink_ml_airflow/trwiki" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "722b73aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "25/07/18 10:54:57 INFO fs.TrashPolicyDefault: Moved: 'hdfs://analytics-hadoop/wmf/cache/artifacts/airflow/ml/add_a_link-0.1.1.conda.tgz' to trash at: hdfs://analytics-hadoop/user/ozge/.Trash/Current/wmf/cache/artifacts/airflow/ml/add_a_link-0.1.1.conda.tgz\n" + ] + } + ], + "source": [ + "# !hdfs dfs -rm /wmf/cache/artifacts/airflow/ml/add_a_link-0.1.1.conda.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "573c2d68", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "479071\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wiki_dbanchorlink_occurrence
0trwiki\"i quit\" maçı{'\"I Quit\" maçı': 25}
1trwiki\"suriye\" ve \"asur\" isimleri arasındaki etimolo...{'Suriye'nin adı': 2}
2trwiki'''fatih karagümrük'''{'Fatih Karagümrük SK': 3}
3trwiki'''ulubey'''{'Ulubey, Ordu': 2}
4trwiki''39 basamak''{'39 Basamak (film, 1935)': 3}
\n", + "
" + ], + "text/plain": [ + " wiki_db anchor \\\n", + "0 trwiki \"i quit\" maçı \n", + "1 trwiki \"suriye\" ve \"asur\" isimleri arasındaki etimolo... \n", + "2 trwiki '''fatih karagümrük''' \n", + "3 trwiki '''ulubey''' \n", + "4 trwiki ''39 basamak'' \n", + "\n", + " link_occurrence \n", + "0 {'\"I Quit\" maçı': 25} \n", + "1 {'Suriye'nin adı': 2} \n", + "2 {'Fatih Karagümrük SK': 3} \n", + "3 {'Ulubey, Ordu': 2} \n", + "4 {'39 Basamak (film, 1935)': 3} " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_anchors = spark.read.parquet(\"/user/ozge/test_addalink_ml_airflow/trwiki/anchors\")\n", + "print(df_anchors.count())\n", + "df_anchors.limit(5).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5aab157", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "add_a_link_remote", + "language": "python", + "name": "add_a_link_remote" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/training/add_a_link/src/add_a_link/pipeline_transformations/__init__.py b/training/add_a_link/src/add_a_link/pipeline_transformations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/training/add_a_link/src/add_a_link/pipeline_transformations/evaluation.py b/training/add_a_link/src/add_a_link/pipeline_transformations/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..417739467bfc37254ebeff18f75cba790c325473 --- /dev/null +++ b/training/add_a_link/src/add_a_link/pipeline_transformations/evaluation.py @@ -0,0 +1,184 @@ +from collections.abc import Iterable + +import pandas as pd +from pyspark.sql import Column as SparkColumn, DataFrame, functions as F, types as T +from sklearn.metrics import auc # type: ignore + +from add_a_link.pipeline_transformations.transformation import Transformation + + +def binary_classifier_metrics( + pred_col: SparkColumn, + label_col: SparkColumn, + grouping_cols: Iterable[SparkColumn] = (), + thresholds: Iterable[int] = range(100), +) -> Transformation: + """For a predictions from a binary classifier, + compute precision, recall, accuracy, fpr, F1, auc + for a series of threshold candidates. The auc + column is the same for all thresholds of a + group. 
+ + pred_col should be a float column + label_col should be a bool column + """ + + metrics = ["precision", "recall", "accuracy", "fpr", "F1"] + + def conditional_count(condition: SparkColumn) -> SparkColumn: + return F.sum( + F.when( + condition, + 1, + ).otherwise(0) + ) + + # define the necessary aggregations in a list that + # will be computed in a single reduce step + agg_cols = [] + for pth in thresholds: + th = pth / 100 + agg_cols.extend( + [ + conditional_count(label_col & (pred_col >= th)).alias(f"tp_{pth}"), + conditional_count(~label_col & (pred_col >= th)).alias(f"fp_{pth}"), + conditional_count(~label_col & (pred_col < th)).alias(f"tn_{pth}"), + conditional_count(label_col & (pred_col < th)).alias(f"fn_{pth}"), + ] + ) + + # calculate the statitics + stats_cols = [] + for th in thresholds: + stats_cols.extend( + [ + # precision = tp / (tp + fp) + (F.col(f"tp_{th}") / (F.col(f"tp_{th}") + F.col(f"fp_{th}"))).alias( + f"precision_{th}" + ), + # recall = tp / (tp + fn) + (F.col(f"tp_{th}") / (F.col(f"tp_{th}") + F.col(f"fn_{th}"))).alias( + f"recall_{th}" + ), + # accuracy = (tp + tn) / total + ((F.col(f"tp_{th}") + F.col(f"tn_{th}")) / F.col("total")).alias( + f"accuracy_{th}" + ), + # fpr = fp / (fp + tn) + (F.col(f"fp_{th}") / (F.col(f"fp_{th}") + F.col(f"tn_{th}"))).alias( + f"fpr_{th}" + ), + ] + ) + + # the F1 score is calculated based on other statistics, so has to be + # done in a second step + f1_cols = [] + for th in thresholds: + f1_cols.extend( + [ + # F1 = 2 * precision * recall / (precision + recall) + ( + (2 * F.col(f"precision_{th}") * F.col(f"recall_{th}")) + / (F.col(f"precision_{th}") + F.col(f"recall_{th}")) + ).alias(f"F1_{th}") + ] + ) + + def transform(predictions_df: DataFrame) -> DataFrame: + # aggregate all the needed counts + counts = predictions_df.groupBy(*grouping_cols).agg( + *agg_cols, F.count("*").alias("total") + ) + stats_df = counts.select(*grouping_cols, *stats_cols) + stats_df = stats_df.select(*stats_df.columns, *f1_cols) + + # stats contains all the statistics for a given grouping in a single row + # e.g. 
there are 502 columns - 100 thresholds * 5 statistics + grouping_cols + + # to create the shape of the dataframe as currently specified, we explode the + # thresholds so that they are in different rows + + thresholds_structs_df = stats_df.select( + *grouping_cols, + *[ + F.struct( + F.lit(th).alias("threshold"), + *[F.col(f"{m}_{th}").alias(m) for m in metrics], + ).alias(f"threshold_{th}") + for th in thresholds + ], + ) + + metrics_df = thresholds_structs_df.select( + *grouping_cols, + F.explode(F.array(*[F.col(f"threshold_{th}") for th in thresholds])).alias( + "thresholds_stats" + ), + ).select(*grouping_cols, "thresholds_stats.*") + + auc_df = metrics_df.transform( + binary_classifier_auc(grouping_cols=grouping_cols) + ) + + if grouping_cols: + return metrics_df.join( + auc_df, + on=[c._jc.toString() for c in grouping_cols], # type: ignore + how="left", + ) + else: + return metrics_df.crossJoin(auc_df) + + return transform + + +def binary_classifier_auc( + grouping_cols: Iterable[SparkColumn] = (), +) -> Transformation: + """Computes the AUC from the model metrics + for a binary classifier + """ + + def auc_func(group_cols, df): # type: ignore + return pd.DataFrame([(*group_cols, auc(df.fpr, df.recall))]) + + def transform(metrics_df: DataFrame) -> DataFrame: + schema = metrics_df.select(*grouping_cols).schema.add("auc", T.FloatType()) + return metrics_df.groupby(*grouping_cols).applyInPandas(auc_func, schema=schema) + + return transform + + +def compare_auc( + candidate_auc_df: DataFrame, + reference_auc_df: DataFrame, + compare_cols: Iterable[SparkColumn] = (), +) -> DataFrame: + """Compares the auc values between a candidate model and a reference model. + The input dataframes require an `auc` column""" + if not compare_cols: + # if there are no compare columns (e.g. there should be a single row) + # we use a dummy unit column + candidate_auc_df = candidate_auc_df.withColumn("unit", F.lit("")) + reference_auc_df = reference_auc_df.withColumn("unit", F.lit("")) + compare_cols = (F.col("unit"),) + + aucs_df = candidate_auc_df.withColumnRenamed("auc", "candidate_auc").join( + reference_auc_df.withColumnRenamed("auc", "reference_auc"), + on=[c._jc.toString() for c in compare_cols], # type: ignore + how="inner", + ) + + return ( + aucs_df.groupby( + F.round(F.col("candidate_auc") - F.col("reference_auc"), 2).alias( + "auc_diff" + ) + ) + .agg( + F.count("*").alias("count"), + F.collect_list(F.concat(*compare_cols)).alias("group"), + ) + .sort("auc_diff") + ) diff --git a/training/add_a_link/src/add_a_link/pipeline_transformations/transformation.py b/training/add_a_link/src/add_a_link/pipeline_transformations/transformation.py new file mode 100644 index 0000000000000000000000000000000000000000..088a248e44919e1943c65254ccc442118a9ab923 --- /dev/null +++ b/training/add_a_link/src/add_a_link/pipeline_transformations/transformation.py @@ -0,0 +1,152 @@ +import logging +from collections.abc import Callable +from typing import TypeAlias + +from pyspark.sql import DataFrame, SparkSession, Window, functions as F + +Transformation: TypeAlias = Callable[[DataFrame], DataFrame] + +ColName: TypeAlias = str + + +def stratified_sample( + cols: list[str], + strata_size: int, + seed: int | None, + repartition: bool = False, +) -> Transformation: + """Returns a same-size stratified sample based on where the strata are + all the unique values in `cols`. The maximum size of each stratum depends + equals `strata_size`. 
+ """ + + def _(df: DataFrame) -> DataFrame: + # Partitioning by just *cols when taking a stratified sample might lead + # to large partitions that are also skewed. Instead we first give each row + # a "partition" number based on the hash code of its page_id (MurmurHash). + # page_id works well to distribute revisions across partitions because their + # distribution across page_ids is fairly uniform with relatively few outliers. + if repartition: + nonlocal strata_size + spark = SparkSession.builder.getOrCreate() + shuffle_partitions = int( + spark.conf.get("spark.sql.shuffle.partitions"), base=10 + ) + if strata_size > 10 * shuffle_partitions: + df = df.withColumn( + "partition", (F.rand() * shuffle_partitions).cast("int") + ) + cols.append("partition") + strata_size = strata_size // shuffle_partitions + 1 + else: + logging.warning("no partitioning applied, strata_size too small") + + window = Window.partitionBy(*cols).orderBy(F.rand(seed)) + df = ( + df.withColumn("row_num", F.row_number().over(window)) + .filter(f"row_num <= {strata_size}") + .drop("row_num") + ) + return df.drop("partition") + + return _ + + +def with_parent_revision_history( + parent_df: DataFrame, + rev_id: ColName = "revision_id", + parent_rev_id: ColName = "revision_parent_id", + wiki_db: ColName = "wiki_db", + keep_nulls: bool = False, +) -> Transformation: + """Aligns the revisions in the dataframe with their parent revisions + from `parent_df` using `rev_id` and `parent_rev_id`. The resulting dataframe + will have all existing columns both for the child and parent revisions, nested + in two struct columns ('revision' & 'parent') with identical fields. + """ + + def _(df: DataFrame) -> DataFrame: + rev_df = df.alias("rev") + rev_parent_df = parent_df.alias("parent") + + # join conditions + rev_id_matches = F.col(f"rev.{parent_rev_id}") == F.col(f"parent.{rev_id}") + wiki_db_matches = F.col(f"rev.{wiki_db}") == F.col(f"parent.{wiki_db}") + + rev_df = rev_df.join( + rev_parent_df, + on=((rev_id_matches) & (wiki_db_matches)), + how="left" if keep_nulls else "inner", + ) + + return rev_df.select( + F.struct("rev.*").alias("revision"), F.struct("parent.*").alias("parent") + ) + + return _ + + +def filter_disputed_reverts( + reverting_rev_df: DataFrame, + rev_id: ColName = "revision_id", + reverting_rev_id: ColName = "revision_first_identity_reverting_revision_id", + is_reverted: ColName = "revision_is_identity_reverted", + page_id: ColName = "page_id", + wiki_db: ColName = "wiki_db", +) -> Transformation: + """Filters all disputed reverts. A disputed revert is any revert + whose reverting revision has also been reverted. This phenomenon + is also known as edit wars. + """ + + def _(df: DataFrame) -> DataFrame: + rev_df = df.alias("rev") + reverting_df = reverting_rev_df.alias("reverter") + + # join conditions + rev_id_matches = F.col(f"rev.{reverting_rev_id}") == F.col(f"reverter.{rev_id}") + page_id_matches = F.col(f"rev.{page_id}") == F.col(f"reverter.{page_id}") + wiki_db_matches = F.col(f"rev.{wiki_db}") == F.col(f"reverter.{wiki_db}") + + rev_df = rev_df.join( + reverting_df, + on=((rev_id_matches) & (page_id_matches) & (wiki_db_matches)), + how="left", + ) + # filter out any reverts where the reverting revision + # is also reverted i.e. 
edit wars + rev_df = rev_df.filter( + F.col(f"reverter.{rev_id}").isNull() | ~F.col(f"reverter.{is_reverted}") + ) + return rev_df.select("rev.*") + + return _ + + +def with_wikitext( + wikitext_df: DataFrame, + rev_id: ColName = "revision_id", + wikitext_rev_id: ColName = "revision_id", + rev_text: ColName = "revision_text", + wiki_db: ColName = "wiki_db", +) -> Transformation: + """Returns a new dataframe by adding a column that contains the revision + wikitext, joining with `wikitext_df` on `rev_id` and `wikitext_rev_id` columns. + """ + + def _(df: DataFrame) -> DataFrame: + df = df.alias("rev") + wt_df = wikitext_df.alias("wt") + + # join conditions + rev_id_matches = F.col(f"rev.{rev_id}") == F.col(f"wt.{wikitext_rev_id}") + wiki_db_matches = F.col(f"rev.{wiki_db}") == F.col(f"wt.{wiki_db}") + + df = df.join( + wt_df, + on=((rev_id_matches) & (wiki_db_matches)), + how="inner", + ) + return df.select("rev.*", f"wt.{rev_text}") + + return _ diff --git a/training/add_a_link/src/add_a_link/utils.py b/training/add_a_link/src/add_a_link/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6b3b5606f0d7e182574dfe9bbefd408080eb5991 --- /dev/null +++ b/training/add_a_link/src/add_a_link/utils.py @@ -0,0 +1,621 @@ +import functools +import operator +import pathlib +import re +import time +import urllib.parse as up +from collections.abc import Generator +from typing import Any, cast + +import mwparserfromhell # type: ignore[import-untyped] +import numpy as np +import requests +import wikitextparser as wtp # type: ignore[import-untyped] +import xgboost +from Levenshtein import jaro as levenshtein_score +from mwtokenizer import Tokenizer # type: ignore[import-untyped] +from mwtokenizer.config.symbols import ( # type: ignore[import-untyped] + ALL_UNICODE_PUNCTUATION, +) +from scipy.stats import kurtosis # type: ignore[import-untyped] +from sklearn.preprocessing import OrdinalEncoder # type: ignore[import-untyped] + +from add_a_link.ngram_utils import ( + get_ngrams, + get_tokens, + tokenize_sentence, +) + +FREQUENCY = 10 + + +def hdfs_path(path: pathlib.Path) -> str: + return f"hdfs://analytics-hadoop{path}" + + +@functools.cache +def get_tokenizer(language_code: str) -> Tokenizer: + return Tokenizer(language_code) + + +class MentionRegexException(Exception): + status_code = 400 + + def __init__(self, mention: str, wikitext: str): + super().__init__() + self.message = f'Unable to find match for "{mention}" in "{wikitext}"' + + def to_dict(self) -> dict[str, str]: + return {"message": self.message} + + +###################### +# parsing titles +###################### + + +def normalise_title(title: str) -> str: + """ + Normalising title (links) + - deal with quotes + - strip() + - '_'--> ' ' + - capitalize first letter + """ + title = up.unquote(title) + title = title.strip() + if len(title) > 0: + title = title[0].upper() + title[1:] + n_title = title.replace("_", " ") + if "#" in n_title: + n_title = n_title.split("#")[0] + return n_title + + +def normalise_anchor(anchor: str) -> str: + """ + Normalising anchor (text): + - strip() + - lowercase + Note that we do not do the other normalisations since we want to match the strings + from the text + """ + # anchor = up.unquote(anchor) + n_anchor = anchor.strip() # .replace("_", " ") + return n_anchor.lower() + + +def wtpGetLinkAnchor(wikilink: wtp.WikiLink) -> tuple[str, str]: + """ + extract anchor and link from a wikilink from wikitextparser. 
+ normalise title and anchor + """ + # normalise the article title (quote, first letter capital) + link_tmp = wikilink.title + link = normalise_title(link_tmp) + # normalise the anchor text (strip and lowercase) + anchor_tmp = wikilink.text if wikilink.text else link_tmp + anchor = normalise_anchor(anchor_tmp) + return link, anchor + + +def getLinks( + wikicode: str, + redirects: dict[str, str] | None = None, + pageids: dict[str, int] | None = None, +) -> dict[str, str]: + """ + get all links in a page + """ + link_dict = {} + linklist = wtp.parse(str(wikicode)).wikilinks + for lnk in linklist: + link, anchor = wtpGetLinkAnchor(lnk) + # if redirects is not None, resolve the redirect + if redirects is not None: + link = resolveRedirect(link, redirects) + # if pageids is not None, keep only links appearing as key in pageids + if pageids is not None: + if link not in pageids: + continue + link_dict[anchor] = link + return link_dict + + +def resolveRedirect(link: str, redirects: dict[str, str]) -> str: + """ + resolve the redirect. + check whether in pageids (main namespace) + + """ + return redirects.get(link, link) + + +def ngram_iterator( + text: str, tokenizer: Tokenizer, gram_length_max: int, gram_length_min: int = 1 +) -> Generator[str, None, None]: + """ + iterator yields all n-grams from a text. + - splits at newline + - spits sentence + - tokenizes + - create string of n tokens for variable n + """ + for sent in tokenize_sentence(text, tokenizer): + tokens = get_tokens(sent, tokenizer) + for gram_length in range(gram_length_max, gram_length_min - 1, -1): + yield from get_ngrams(tokens, gram_length) + + +########################## +# getting feature-dataset +########################## + + +def embeddings_distance(a: list[float], b: list[float]) -> float: + dst = 0.0 + try: + norm_ab = np.linalg.norm(b) * np.linalg.norm(a) + # if norm of any vector is 0, we assign dst=0 (maximum dst) + if norm_ab == 0: + dst = 0.0 + else: + dst = np.dot(a, b) / norm_ab + # dst = (np.dot(a, b) / np.linalg.norm(a) / np.linalg.norm(b)) + except Exception as e: + print("ERROR: ", type(e).__name__, e) + pass + if np.isnan(dst): + dst = 0 + return float(dst) + + +def getDistEmb(ent_a: str, ent_b: str, embd: dict[str, list[float]]) -> float: + dst = 0.0 + try: # try if entities are in embd + a = embd[ent_a] + b = embd[ent_b] + norm_ab = np.linalg.norm(b) * np.linalg.norm(a) + # if norm of any vector is 0, we assign dst=0 (maximum dst) + if norm_ab == 0: + dst = 0.0 + else: + dst = np.dot(a, b) / norm_ab + # dst = (np.dot(a, b) / np.linalg.norm(a) / np.linalg.norm(b)) + except KeyError: + # Embedding not found, ignore key + pass + except Exception as e: + print("ERROR: ", type(e).__name__, e) + pass + if np.isnan(dst): + dst = 0 + return dst + + +def get_feature_set( + page: str, + text: str, + link: str, + anchors: dict[str, dict[str, int]], + word2vec: dict[str, list[float]], + wiki_id: str, + tokenizer: Tokenizer, +) -> tuple[int, int, float, float, float, float, str]: + ngram = list(tokenizer.word_tokenize(text, use_abbreviation=True)) # tokenize text + ngram = list( + filter(lambda x: not x.startswith(" "), ngram) + ) # could be a single or multiple spaces that needs to be removed + ngram = list( + filter(lambda x: x not in ALL_UNICODE_PUNCTUATION, ngram) + ) # remove punctuation + ngram_len = len(ngram) + freq = anchors[text][link] # How many times was the link use with this text + ambig = len(anchors[text]) # home many different links where used with this text + kur = kurtosis( + 
sorted(list(anchors[text].values()), reverse=True) + [1] * (1000 - ambig) + ) # Skew of usage text/link distribution + w2v = getDistEmb( + page, link, word2vec + ) # W2V Distance between the source and target page + leven = levenshtein_score(text.lower(), link.lower()) + return (ngram_len, freq, ambig, float(kur), float(w2v), float(leven), wiki_id) + + +########################## +# evaluation classification +########################## +# Main decision function. +# For a given page X and a piece of text "lipsum".. check all the candidate and make +# inference. Returns the most likely candidate according to the pre-trained link model. +# If the probability is below a certain threshold, return None +def classify_links( + page: str, + text: str, + anchors: dict[str, dict[str, int]], + word2vec: dict[str, list[float]], + model: xgboost.XGBClassifier, + wiki_db_encoder: OrdinalEncoder, + wiki_id: str, + tokenizer: Tokenizer, + threshold: float = 0.95, +) -> tuple[str, float] | None: + # start_time = time.time() + cand_prediction = {} + # Work with the `FREQUENCY` most frequent candidates + limited_cands = anchors[text] + if len(limited_cands) > FREQUENCY: + limited_cands = dict( + sorted(anchors[text].items(), key=operator.itemgetter(1), reverse=True)[:10] + ) + for cand in limited_cands: + # get the features + ngram, freq, ambig, kur, w2v, leven, wiki_id = get_feature_set( + page, text, cand, anchors, word2vec, wiki_id, tokenizer + ) + cand_feats = ( + ngram, + freq, + ambig, + kur, + w2v, + leven, + wiki_db_encoder.transform([[wiki_id]])[0][0], + ) + + # compute the model probability + cand_prediction[cand] = model.predict_proba( + np.array(cand_feats).reshape((1, -1)) + )[0, 1] + + # Compute the top candidate + if not cand_prediction: + return None + top_candidate = max(cand_prediction.items(), key=operator.itemgetter(1)) + + # Check if the max probability meets the threshold before returning + if top_candidate[1] < threshold: + return None + # print("--- %s seconds ---" % (time.time() - start_time)) + return top_candidate + + +# helper class to break out of nested for-loop when reaching set number of +# recommendations +class MaxRecError(Exception): + pass + + +# Helper class to break out of page processing loop when maximum page processing time +# has been reached. +class MaxTimeError(Exception): + pass + + +# Actual Linking function +def process_page( # noqa: PLR0915, PLR0912 + wikitext: str, + page: str, + anchors: dict[str, dict[str, int]], + pageids: dict[str, int], + redirects: dict[str, str], + word2vec: dict[str, list[float]], + model: xgboost.XGBClassifier, + wiki_db_encoder: OrdinalEncoder, + wiki_id: str, + language_code: str, + tokenizer: Tokenizer, + threshold: float = 0.8, + pr: bool = True, + return_wikitext: bool = True, + context: int = 10, + maxrec: int = -1, + sections_to_exclude: list[str] | None = None, +) -> dict[str, Any] | mwparserfromhell.wikicode.Wikicode: + """ + Recommend links for a given wikitext. + + :param str wikitext: Page source + :param str page: Page title + :param dict anchors: Anchor dataset for the wiki + (link text -> {link target title -> frequency}) + :param dict pageids: Pageid dataset for the wiki (title -> id) + :param dict redirects: Redirect dataset for the wiki + (original title -> redirect target) + :param dict word2vec: word2vec dataset for the wiki (word -> vector) + :param xgboost.XGBClassifier model: The wiki's model for predicting link targets + for words. + :param string language_code: The ISO 639 language code to use with processing. 
+ :param float threshold: Minimum probability score required to include a prediction + :param bool pr: Whether to include probability scores in the wikitext as 'pr' link + parameters. + :param bool return_wikitext: Whether to return wikitext or data. + :param int context: The number of characters before/after the link to include when + returning data. + :param int maxrec: Maximum number of recommendations to return (-1 for unlimited) + :param list sections_to_exclude: List of section names to exclude from link + suggestion generation, e.g. "References" + :return: When return_wikitext is true, return updated wikitext with the new links + added (or pseudo-wikitext with the custom 'pr' parameters if pr=True). + Otherwise, return a data structure suitable for returning from the API. + :rtype: string or dict + """ + from icu import Locale, UnicodeString # type: ignore[import-untyped] + + if sections_to_exclude is None: + sections_to_exclude = [] + sections_to_exclude_nocase = list( + section.casefold() for section in sections_to_exclude + ) + tokenizer = Tokenizer(language_code=language_code) + + response = {"links": cast(list[dict[str, Any]], []), "info": ""} + init_time = time.time() + # Give ourselves a one second buffer to return the response after the + # configured timeout limit has been reached. + max_page_process_time_buffer = 1 + max_page_process_time = 30 - max_page_process_time_buffer + page_wikicode = mwparserfromhell.parse(wikitext) + + page_wikicode_init = str(page_wikicode) # save the initial state + page_wikicode_text_nodes = page_wikicode.filter_text(recursive=False) + + # get all existing links + dict_links = getLinks( + page_wikicode, redirects=redirects, pageids=pageids + ) # get all links, resolve redirects + linked_mentions = set(dict_links.keys()) + linked_links = set(dict_links.values()) + # include also current pagetitle + linked_mentions.add(normalise_anchor(page)) + linked_links.add(normalise_title(page)) + + tested_mentions = set() + + # try-except to break out of nested for-loop once we found maxrec links to add + try: + for section in page_wikicode.get_sections( + include_lead=True, include_headings=True, flat=True + ): + if not section: + # This means the section is empty; one instance where this occurs + # is in articles that have no lead and instead begin with a section + # heading + continue + # Special-handling for lead section, which doesn't have a name. + if ( + not isinstance(section.nodes[0], mwparserfromhell.nodes.heading.Heading) + and "%LEAD%" in sections_to_exclude + ): + continue + + section_heading = str(section.nodes[0].title).strip() + if section_heading.casefold() in sections_to_exclude_nocase: + continue + for node in section.filter_text(recursive=False): + mentions = {} + # check the offset of the node in the wikitext_init + node_val = node.value + i1_node_init = page_wikicode_init.find(node_val) + i2_node_init = i1_node_init + len(node_val) + # The ngram_iterator generates substrings from the text of the article + # to check as candidate-anchors for links. It will do that by + # concatenating individual word-tokens to ngrams (strings that consist + # of n tokens); for example "Atlantic Ocean" would be a 2-gram. + # The arguments gram_length_max, gram_length_min define the range + # in which we vary n The current range n=5,...,1 means we first check + # all substrings of length 5, then 4, and so on until we reach 1. + # This range is defined by looking at the typical size of existing + # links in the anchor-dictionary. 
There are text-anchors that are not + # covered by this; they have much larger values for n; however, most + # anchors have small values of n. Reducing the range of the + # ngram-iterator we have fewer substrings for which we check the + # anchor-dictionary (and subsequently other lookups from checking + # whether to put a link). + grams = ngram_iterator( + text=node, + tokenizer=tokenizer, + gram_length_max=5, + gram_length_min=1, + ) + for gram in grams: + if time.time() > init_time + max_page_process_time: + response["info"] = ( + "Stopping page processing as maximum processing time " + f"{max_page_process_time + max_page_process_time_buffer}" + "seconds reached" + ) + raise MaxTimeError + mentions[gram.lower()] = gram + + if not mentions: + continue + + anchors_with_mentions = anchors + + for mention, mention_original in mentions.items(): + if ( + # if the mention exist in the DB + mention in anchors_with_mentions + # it was not previously linked (or part of a link) + and not any(mention in s for s in linked_mentions) + # none of its candidate links is already used + and not bool( + set(anchors_with_mentions[mention].keys()) & linked_links + ) + # it was not tested before (for efficiency) + and mention not in tested_mentions + ): + # logic + # print("testing:", mention, len(anchors[mention])) + candidate = classify_links( + page, + mention, + anchors_with_mentions, + word2vec, + model, + wiki_db_encoder, + wiki_id, + tokenizer, + threshold=threshold, + ) + if candidate: + candidate_link, candidate_proba = candidate + # print(">> ", mention, candidate) + ############## Critical ############## + # Insert The Link in the current wikitext + mention_regex = re.compile( + rf"(?){re.escape(mention_original)}(?![\w\s]*[\]\]])" + ) + mention_regex_i = re.compile( + mention_regex.pattern, re.IGNORECASE + ) + new_str = "[[" + candidate_link + "|" + mention_original + # add the probability + if pr: + new_str += "|pr=" + str(candidate_proba) + new_str += "]]" + newval, found = mention_regex.subn(new_str, node.value, 1) + node.value = newval + ###################################### + # Book-keeping + linked_mentions.add(mention) + linked_links.add(candidate_link) + if found == 1: + page_wikicode_init_substr = page_wikicode_init[ + i1_node_init:i2_node_init + ] + # Handle lower-casing of characters in some languages, + # e.g. in Azeri, İnsan should be lowercased to insan, + # but the default lower-casing in python will change + # the İ to an i with two dots. 
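+                            # Lowercase the original substring with ICU using
+                            # the wiki's locale, then search it with the
+                            # case-insensitive regex to find the mention's
+                            # offset in the original wikitext.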
+ page_wikicode_init_substr_lower = str( + UnicodeString(page_wikicode_init_substr).toLower( + Locale(language_code) + ) + ) + match = mention_regex_i.search( + page_wikicode_init_substr_lower + ) + if match is None: + raise MentionRegexException( + mention, page_wikicode_init_substr_lower + ) + i1_sub = match.start() + start_offset = i1_node_init + i1_sub + end_offset = start_offset + len(mention) + # provide context of the mention (+/- c characters in + # substring and wikitext) + if context is None: + context_wikitext = mention_original + context_substring = mention_original + else: + ## context substring + str_context = page_wikicode_init_substr + i1_c = max([0, i1_sub - context]) + i2_c = min( + [ + len(str_context), + i1_sub + len(mention_original) + context, + ] + ) + context_substring = [ + str_context[i1_c:i1_sub], + str_context[ + i1_sub + len(mention_original) : i2_c + ], + ] + ## wikitext substring + str_context = wikitext + i1_c = max([0, start_offset - context]) + i2_c = min( + [ + len(str_context), + end_offset + context, + ] + ) + context_wikitext = [ + str_context[i1_c:i1_sub], + str_context[ + i1_sub + len(mention_original) : i2_c + ], + ] + # Find 0-based index of anchor text match in a way that + # hopefully mostly survives wikitext to HTML + # transformation: count occurrences of the text + # in top-level text nodes u + preceding_nodes = page_wikicode_text_nodes[ + : page_wikicode_text_nodes.index(node) + ] + match_index = sum( + str(node).count(mention_original) + for node in preceding_nodes + ) + page_wikicode_init_substr[:i1_sub].count( + mention_original + ) + new_link: dict[str, Any] = { + "link_target": candidate_link, + "link_text": mention_original, + "score": float(candidate_proba), + "start_offset": start_offset, + "end_offset": end_offset, + "match_index": match_index, + "context_wikitext": context_wikitext, + "context_plaintext": context_substring, + } + prev_links = list(response["links"]) + prev_links.append(new_link) + response["links"] = prev_links + # stop iterating the wikitext to generate link + # recommendations as soon as we have maxrec + # link-recommendations + if len(response["links"]) == maxrec: + response["info"] = ( + "Stopping page processing as max " + "recommendations limit {maxrec} reached." 
+ ) + raise MaxRecError + # More Book-keeping + tested_mentions.add(mention) + except (MaxRecError, MaxTimeError) as e: + print("ERROR: ", type(e).__name__, e) + pass + # if yes, we return the adapted wikitext + # else just return list of links with offsets + if return_wikitext: + return page_wikicode + else: + return response + + +def get_wiki_url(wiki_id: str) -> str: + """ + generate a wiki's url by replacing underscores "_" with hyphen "-" and + matching wikipedia domain e.g "bat_smgwiki" becomes + "https://bat-smg.wikipedia.org/w/api.php" + + :param str wiki_id: Wikipedia wiki ID + """ + chars_to_replace = {"_": "-", "wiki": ".wikipedia.org"} + for k, v in chars_to_replace.items(): + wiki_id = wiki_id.replace(k, v) + wiki_url = "https://" + wiki_id + "/w/api.php" + return wiki_url + + +def get_language_code(wiki_url: str) -> str: + """ + use a wiki's url to get its language code via API siteinfo + + :param str wiki_url: Wikipedia API URL + """ + params = { + "action": "query", + "meta": "siteinfo", + "formatversion": "2", + "format": "json", + } + response = requests.get(url=wiki_url, params=params) + data = response.json() + language_code = data["query"]["general"]["lang"] + return str(language_code) diff --git a/training/add_a_link/src/add_a_link/wikitext_to_plaintext.py b/training/add_a_link/src/add_a_link/wikitext_to_plaintext.py new file mode 100644 index 0000000000000000000000000000000000000000..ceda0049e6bd73d067f3c3bb8a2c128ef5b42fdc --- /dev/null +++ b/training/add_a_link/src/add_a_link/wikitext_to_plaintext.py @@ -0,0 +1,718 @@ +# helper functions for handling mwparserfromhell / wikitext +# adapted from https://github.com/geohci/edit-types/blob/main/mwedittypes/utils.py + +from typing import Any + +import mwparserfromhell as mw # type: ignore[import-untyped] + +TEXT_FORMATTING_TAGS = ( + "b", + "i", + "s", + "u", + "del", + "ins", + "hr", + "br", + "pre", + "nowiki", + "small", + "big", + "sub", + "sup", + "font", + "blockquote", + "span", + "center", +) +TABLE_ELEMENTS_TAGS = ("th", "tr", "td") +LIST_TAGS = ("li", "dt", "dd", "ul", "ol", "dl") + +MEDIA_PREFIXES = ["File", "Image"] +CAT_PREFIXES = ["Category"] + +# Source: for each Wikipedia language code (example shown for "ab"), +# aliases for namespaces -2 and 6 accessed via this API call: +# https://ab.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespacealiases|namespaces&format=json&formatversion=2 +# Last accessed: 21 December 2021 +MEDIA_ALIASES = { + "ab": ["Медиа", "Файл", "Афаил", "Амедиа", "Изображение"], + "ace": ["Beureukaih", "Gambar", "Alat", "Berkas"], + "ady": ["Медиа"], + "af": ["Lêer", "Beeld"], + "als": ["Medium", "Datei", "Bild"], + "am": ["ፋይል", "ስዕል"], + "an": ["Imachen", "Imagen"], + "ang": ["Ymele", "Biliþ"], + "ar": ["ميديا", "صورة", "وسائط", "ملف"], + "arc": ["ܠܦܦܐ", "ܡܝܕܝܐ"], + "arz": ["ميديا", "صورة", "وسائط", "ملف"], + "as": ["চিত্ৰ", "चित्र", "চিত্র", "মাধ্যম"], + "ast": ["Imaxen", "Ficheru", "Imaxe", "Archivu", "Imagen", "Medios"], + "atj": ["Tipatcimoctakewin", "Natisinahikaniwoc"], + "av": ["Медиа", "Файл", "Изображение"], + "ay": ["Medio", "Archivo", "Imagen"], + "az": ["Mediya", "Şəkil", "Fayl"], + "azb": ["رسانه", "تصویر", "مدیا", "فایل", "رسانه‌ای"], + "ba": ["Медиа", "Рәсем", "Файл", "Изображение"], + "bar": ["Medium", "Datei", "Bild"], + "bat-smg": ["Vaizdas", "Medėjė", "Abruozdielis"], + "bcl": ["Medio", "Ladawan"], + "be": ["Мультымедыя", "Файл", "Выява"], + "be-x-old": ["Мэдыя", "Файл", "Выява"], + "bg": ["Медия", "Файл", "Картинка"], + "bh": ["मीडिया", "चित्र"], + 
"bjn": ["Barakas", "Gambar", "Berkas"], + "bm": ["Média", "Fichier"], + "bn": ["চিত্র", "মিডিয়া"], + "bpy": ["ছবি", "মিডিয়া"], + "br": ["Skeudenn", "Restr"], + "bs": ["Mediji", "Slika", "Datoteka", "Medija"], + "bug": ["Gambar", "Berkas"], + "bxr": ["Файл", "Меди", "Изображение"], + "ca": ["Fitxer", "Imatge"], + "cbk-zam": ["Medio", "Archivo", "Imagen"], + "cdo": ["文件", "媒體", "圖像", "檔案"], + "ce": ["Хlум", "Медиа", "Сурт", "Файл", "Медйа", "Изображение"], + "ceb": ["Payl", "Medya", "Imahen"], + "ch": ["Litratu"], + "ckb": ["میدیا", "پەڕگە"], + "co": ["Immagine"], + "crh": ["Медиа", "Resim", "Файл", "Fayl", "Ресим"], + "cs": ["Soubor", "Média", "Obrázok"], + "csb": ["Òbrôzk", "Grafika"], + "cu": ["Видъ", "Ви́дъ", "Дѣло", "Срѣдьства"], + "cv": ["Медиа", "Ӳкерчĕк", "Изображение"], + "cy": ["Delwedd"], + "da": ["Billede", "Fil"], + "de": ["Medium", "Datei", "Bild"], + "din": ["Ciɛl", "Apamduööt"], + "diq": ["Medya", "Dosya"], + "dsb": ["Wobraz", "Dataja", "Bild", "Medija"], + "dty": ["चित्र", "मिडिया"], + "dv": ["ފައިލު", "މީޑިއާ", "ފައިލް"], + "el": ["Εικόνα", "Αρχείο", "Μέσο", "Μέσον"], + "eml": ["Immagine"], + "eo": ["Dosiero", "Aŭdvidaĵo"], + "es": ["Medio", "Archivo", "Imagen"], + "et": ["Pilt", "Fail", "Meedia"], + "eu": ["Irudi", "Fitxategi"], + "ext": ["Archivu", "Imagen", "Mediu"], + "fa": ["رسانه", "تصویر", "مدیا", "پرونده", "رسانه‌ای"], + "ff": ["Média", "Fichier"], + "fi": ["Kuva", "Tiedosto"], + "fiu-vro": ["Pilt", "Meediä"], + "fo": ["Miðil", "Mynd"], + "fr": ["Média", "Fichier"], + "frp": ["Émâge", "Fichiér", "Mèdia"], + "frr": ["Medium", "Datei", "Bild"], + "fur": ["Immagine", "Figure"], + "fy": ["Ofbyld"], + "ga": ["Íomhá", "Meán"], + "gag": ["Mediya", "Medya", "Resim", "Dosya", "Dosye"], + "gan": [ + "媒体文件", + "文件", + "文檔", + "档案", + "媒體", + "图像", + "圖像", + "媒体", + "檔案", + ], + "gd": ["Faidhle", "Meadhan"], + "gl": ["Imaxe", "Ficheiro", "Arquivo", "Imagem"], + "glk": ["رسانه", "تصویر", "پرونده", "فاىل", "رسانه‌ای", "مديا"], + "gn": ["Medio", "Imagen", "Ta'ãnga"], + "gom": ["माध्यम", "मिडिया", "फायल"], + "gor": ["Gambar", "Berkas"], + "got": ["𐍆𐌴𐌹𐌻𐌰"], + "gu": ["દ્રશ્ય-શ્રાવ્ય (મિડિયા)", "દ્રશ્ય-શ્રાવ્ય_(મિડિયા)", "ચિત્ર"], + "gv": ["Coadan", "Meanyn"], + "hak": ["文件", "媒體", "圖像", "檔案"], + "haw": ["Kiʻi", "Waihona", "Pāpaho"], + "he": ["תמונה", "קו", "מדיה", "קובץ"], + "hi": ["मीडिया", "चित्र"], + "hif": ["file", "saadhan"], + "hr": ["Mediji", "DT", "Slika", "F", "Datoteka"], + "hsb": ["Wobraz", "Dataja", "Bild"], + "ht": ["Imaj", "Fichye", "Medya"], + "hu": ["Kép", "Fájl", "Média"], + "hy": ["Պատկեր", "Մեդիա"], + "ia": ["Imagine", "Multimedia"], + "id": ["Gambar", "Berkas"], + "ig": ["Nká", "Midia", "Usòrò", "Ákwúkwó orünotu", "Ákwúkwó_orünotu"], + "ii": ["媒体文件", "文件", "档案", "图像", "媒体"], + "ilo": ["Midia", "Papeles"], + "inh": ["Медиа", "Файл", "Изображение"], + "io": ["Imajo", "Arkivo"], + "is": ["Miðill", "Mynd"], + "it": ["Immagine"], + "ja": ["メディア", "ファイル", "画像"], + "jbo": ["velsku", "datnyvei"], + "jv": ["Barkas", "Medhia", "Gambar", "Médhia"], + "ka": ["მედია", "სურათი", "ფაილი"], + "kaa": ["Swret", "Таспа", "سۋرەت", "Taspa", "Su'wret", "Сурет", "تاسپا"], + "kab": ["Tugna"], + "kbd": ["Медиа", "Файл"], + "kbp": ["Média", "Fichier"], + "kg": ["Fisye"], + "kk": ["Swret", "سۋرەت", "Таспа", "Taspa", "Сурет", "تاسپا"], + "kl": ["Billede", "Fiileq", "Fil"], + "km": ["ឯកសារ", "រូបភាព", "មេឌា", "មីឌា"], + "kn": ["ಚಿತ್ರ", "ಮೀಡಿಯ"], + "ko": ["미디어", "파일", "그림"], + "koi": ["Медиа", "Файл", "Изображение"], + "krc": ["Медиа", "Файл", "Изображение"], + "ks": ["میڈیا", "فَیِل"], + 
"ksh": [ + "Beld", + "Meedije", + "Medie", + "Belld", + "Medium", + "Datei", + "Meedijum", + "Bild", + ], + "ku": ["میدیا", "پەڕگە", "Medya", "Wêne"], + "kv": ["Медиа", "Файл", "Изображение"], + "kw": ["Restren"], + "ky": ["Медиа", "Файл"], + "la": ["Imago", "Fasciculus"], + "lad": ["Dossia", "Medya", "Archivo", "Dosya", "Imagen", "Meddia"], + "lb": ["Fichier", "Bild"], + "lbe": ["Медиа", "Сурат", "Изображение"], + "lez": ["Медиа", "Mediya", "Файл", "Şəkil", "Изображение"], + "lfn": ["Fix"], + "li": ["Afbeelding", "Plaetje", "Aafbeilding"], + "lij": ["Immaggine", "Immagine"], + "lmo": ["Immagine", "Imàjine", "Archivi"], + "ln": ["Média", "Fichier"], + "lo": ["ສື່ອ", "ສື່", "ຮູບ"], + "lrc": ["رسانه", "تصویر", "رسانه‌ای", "جانیا", "أسگ", "ڤارئسگأر"], + "lt": ["Vaizdas", "Medija"], + "ltg": ["Medeja", "Fails"], + "lv": ["Attēls"], + "mai": ["मेडिया", "फाइल"], + "map-bms": ["Barkas", "Medhia", "Gambar", "Médhia"], + "mdf": ["Медиа", "Няйф", "Изображение"], + "mg": ["Rakitra", "Sary", "Média"], + "mhr": ["Медиа", "Файл", "Изображение"], + "min": ["Gambar", "Berkas"], + "mk": ["Податотека", "Медија", "Медиум", "Слика"], + "ml": ["പ്രമാണം", "ചി", "മീഡിയ", "പ്ര", "ചിത്രം"], + "mn": ["Медиа", "Файл", "Зураг"], + "mr": ["चित्र", "मिडिया"], + "mrj": ["Медиа", "Файл", "Изображение"], + "ms": ["Fail", "Imej"], + "mt": ["Midja", "Medja", "Stampa"], + "mwl": ["Multimédia", "Fexeiro", "Ficheiro", "Arquivo", "Imagem"], + "my": ["ဖိုင်", "မီဒီယာ"], + "myv": ["Медия", "Артовкс", "Изображение"], + "mzn": ["رسانه", "تصویر", "مه‌دیا", "مدیا", "پرونده", "رسانه‌ای"], + "nah": ["Mēdiatl", "Īxiptli", "Imagen"], + "nap": ["Fiùra", "Immagine"], + "nds": ["Datei", "Bild"], + "nds-nl": ["Ofbeelding", "Afbeelding", "Bestaand"], + "ne": ["मीडिया", "चित्र"], + "new": ["किपा", "माध्यम"], + "nl": ["Bestand", "Afbeelding"], + "nn": ["Fil", "Bilde", "Filpeikar"], + "no": ["Fil", "Medium", "Bilde"], + "nov": [], + "nrm": ["Média", "Fichier"], + "nso": ["Seswantšho"], + "nv": ["Eʼelyaaígíí"], + "oc": ["Imatge", "Fichièr", "Mèdia"], + "olo": ["Kuva", "Medii", "Failu"], + "or": ["ମାଧ୍ୟମ", "ଫାଇଲ"], + "os": ["Ныв", "Медиа", "Файл", "Изображение"], + "pa": ["ਤਸਵੀਰ", "ਮੀਡੀਆ"], + "pcd": ["Média", "Fichier"], + "pdc": ["Medium", "Datei", "Bild", "Feil"], + "pfl": ["Dadai", "Medium", "Datei", "Bild"], + "pi": ["मीडिया", "पटिमा"], + "pl": ["Plik", "Grafika"], + "pms": ["Figura", "Immagine"], + "pnb": ["میڈیا", "تصویر", "فائل"], + "pnt": ["Εικόνα", "Αρχείον", "Εικόναν", "Μέσον"], + "ps": ["انځور", "رسنۍ", "دوتنه"], + "pt": ["Multimédia", "Ficheiro", "Arquivo", "Imagem"], + "qu": ["Midya", "Imagen", "Rikcha"], + "rm": ["Multimedia", "Datoteca"], + "rmy": ["Fişier", "Mediya", "Chitro", "Imagine"], + "ro": ["Fişier", "Imagine", "Fișier"], + "roa-rup": ["Fişier", "Imagine", "Fișier"], + "roa-tara": ["Immagine"], + "ru": ["Медиа", "Файл", "Изображение"], + "rue": ["Медіа", "Медиа", "Файл", "Изображение", "Зображення"], + "rw": ["Dosiye", "Itangazamakuru"], + "sa": ["चित्रम्", "माध्यमम्", "सञ्चिका", "माध्यम", "चित्रं"], + "sah": ["Миэдьийэ", "Ойуу", "Билэ", "Изображение"], + "sat": ["ᱨᱮᱫ", "ᱢᱤᱰᱤᱭᱟ"], + "sc": ["Immàgini"], + "scn": ["Immagine", "Mmàggini", "Mèdia"], + "sd": ["عڪس", "ذريعات", "فائل"], + "se": ["Fiila"], + "sg": ["Média", "Fichier"], + "sh": ["Mediji", "Slika", "Медија", "Datoteka", "Medija", "Слика"], + "si": ["රූපය", "මාධ්‍යය", "ගොනුව"], + "sk": ["Súbor", "Obrázok", "Médiá"], + "sl": ["Slika", "Datoteka"], + "sq": ["Figura", "Skeda"], + "sr": [ + "Датотека", + "Medij", + "Slika", + "Медија", + "Datoteka", + "Медиј", + "Medija", 
+ "Слика", + ], + "srn": ["Afbeelding", "Gefre"], + "stq": ["Bielde", "Bild"], + "su": ["Média", "Gambar"], + "sv": ["Fil", "Bild"], + "sw": ["Faili", "Picha"], + "szl": ["Plik", "Grafika"], + "ta": ["படிமம்", "ஊடகம்"], + "tcy": ["ಮಾದ್ಯಮೊ", "ಫೈಲ್"], + "te": ["ఫైలు", "దస్త్రం", "బొమ్మ", "మీడియా"], + "tet": ["Imajen", "Arquivo", "Imagem"], + "tg": ["Акс", "Медиа"], + "th": ["ไฟล์", "สื่อ", "ภาพ"], + "ti": ["ፋይል", "ሜድያ"], + "tk": ["Faýl"], + "tl": ["Midya", "Talaksan"], + "tpi": ["Fail"], + "tr": ["Medya", "Resim", "Dosya", "Ortam"], + "tt": ["Медиа", "Рәсем", "Файл", "Räsem", "Изображение"], + "ty": ["Média", "Fichier"], + "tyv": ["Медиа", "Файл", "Изображение"], + "udm": ["Медиа", "Файл", "Суред", "Изображение"], + "ug": ["ۋاسىتە", "ھۆججەت"], + "uk": ["Медіа", "Медиа", "Файл", "Изображение", "Зображення"], + "ur": ["میڈیا", "تصویر", "وسیط", "زریعہ", "فائل", "ملف"], + "uz": ["Mediya", "Tasvir", "Fayl"], + "vec": ["Immagine", "Imàjine", "Mèdia"], + "vep": ["Pilt", "Fail"], + "vi": ["Phương_tiện", "Tập_tin", "Hình", "Tập tin", "Phương tiện"], + "vls": ["Afbeelding", "Ofbeeldienge"], + "vo": ["Ragiv", "Magod", "Nünamakanäd"], + "wa": ["Imådje"], + "war": ["Medya", "Fayl", "Paypay"], + "wo": ["Xibaarukaay", "Dencukaay"], + "wuu": ["文件", "档案", "图像", "媒体"], + "xal": ["Аһар", "Боомг", "Изображение", "Зург"], + "xmf": ["მედია", "სურათი", "ფაილი"], + "yi": ["מעדיע", "תמונה", "טעקע", "בילד"], + "yo": ["Fáìlì", "Amóhùnmáwòrán", "Àwòrán"], + "za": ["媒体文件", "文件", "档案", "图像", "媒体"], + "zea": ["Afbeelding", "Plaetje"], + "zh": [ + "媒体文件", + "F", + "文件", + "媒體", + "档案", + "图像", + "圖像", + "媒体", + "檔案", + ], + "zh-classical": ["文件", "媒體", "圖像", "檔案"], + "zh-min-nan": ["tóng-àn", "文件", "媒體", "Mûi-thé", "圖像", "檔案"], + "zh-yue": [ + "檔", + "档", + "文件", + "图", + "媒體", + "圖", + "档案", + "图像", + "圖像", + "媒体", + "檔案", + ], +} + +# Source: for each Wikipedia language code (example shown for "ab"), aliases for +# namespace 14 accessed via this API call: +# https://ab.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespacealiases|namespaces&format=json&formatversion=2 +# Last accessed: 21 December 2021 +CAT_ALIASES = { + "ab": ["Категория", "Акатегориа"], + "ace": ["Kawan", "Kategori"], + "af": ["Kategorie"], + "ak": ["Nkyekyem"], + "als": ["Kategorie"], + "am": ["መደብ"], + "an": ["Categoría"], + "ang": ["Flocc"], + "ar": ["تصنيف"], + "arc": ["ܣܕܪܐ"], + "arz": ["تصنيف"], + "as": ["CAT", "শ্ৰেণী", "श्रेणी", "শ্রেণী"], + "ast": ["Categoría"], + "atj": ["Tipanictawin"], + "av": ["Категория"], + "ay": ["Categoría"], + "az": ["Kateqoriya"], + "azb": ["بؤلمه"], + "ba": ["Төркөм", "Категория"], + "bar": ["Kategorie"], + "bat-smg": ["Kategorija", "Kateguorėjė"], + "bcl": ["Kategorya"], + "be": ["Катэгорыя"], + "be-x-old": ["Катэгорыя"], + "bg": ["Категория"], + "bh": ["श्रेणी"], + "bjn": ["Tumbung", "Kategori"], + "bm": ["Catégorie"], + "bn": ["বিষয়শ্রেণী", "വിഭാഗം"], + "bpy": ["থাক"], + "br": ["Rummad"], + "bs": ["Kategorija"], + "bug": ["Kategori"], + "bxr": ["Категори", "Категория"], + "ca": ["Categoria"], + "cbk-zam": ["Categoría"], + "cdo": ["分類"], + "ce": ["Категори", "Тоба", "Кадегар"], + "ceb": ["Kategoriya"], + "ch": ["Katigoria"], + "ckb": ["پ", "پۆل"], + "co": ["Categoria"], + "crh": ["Категория", "Kategoriya"], + "cs": ["Kategorie"], + "csb": ["Kategòrëjô"], + "cu": ["Катигорї", "Категория", "Катигорїꙗ"], + "cv": ["Категори"], + "cy": ["Categori"], + "da": ["Kategori"], + "de": ["Kategorie"], + "din": ["Bekätakthook"], + "diq": ["Kategoriye", "Kategori"], + "dsb": ["Kategorija"], + "dty": ["श्रेणी"], + 
"dv": ["ޤިސްމު"], + "el": ["Κατηγορία"], + "eml": ["Categoria"], + "eo": ["Kategorio"], + "es": ["CAT", "Categoría"], + "et": ["Kategooria"], + "eu": ["Kategoria"], + "ext": ["Categoría", "Categoria"], + "fa": ["رده"], + "ff": ["Catégorie"], + "fi": ["Luokka"], + "fiu-vro": ["Katõgooria"], + "fo": ["Bólkur"], + "fr": ["Catégorie"], + "frp": ["Catègorie"], + "frr": ["Kategorie"], + "fur": ["Categorie"], + "fy": ["Kategory"], + "ga": ["Rang", "Catagóir"], + "gag": ["Kategori", "Kategoriya"], + "gan": ["分類", "分类"], + "gd": ["Roinn-seòrsa"], + "gl": ["Categoría"], + "glk": ["جرگه", "رده"], + "gn": ["Ñemohenda"], + "gom": ["वर्ग", "श्रेणी"], + "gor": ["Dalala"], + "got": ["𐌷𐌰𐌽𐍃𐌰"], + "gu": ["શ્રેણી", "CAT", "શ્રે"], + "gv": ["Ronney"], + "hak": ["分類"], + "haw": ["Māhele"], + "he": ["קטגוריה", "קט"], + "hi": ["श्र", "श्रेणी"], + "hif": ["vibhag"], + "hr": ["CT", "KT", "Kategorija"], + "hsb": ["Kategorija"], + "ht": ["Kategori"], + "hu": ["Kategória"], + "hy": ["Կատեգորիա"], + "ia": ["Categoria"], + "id": ["Kategori"], + "ie": ["Categorie"], + "ig": ["Ébéonọr", "Òtù"], + "ii": ["分类"], + "ilo": ["Kategoria"], + "inh": ["ОагӀат"], + "io": ["Kategorio"], + "is": ["Flokkur"], + "it": ["CAT", "Categoria"], + "ja": ["カテゴリ"], + "jbo": ["klesi"], + "jv": ["Kategori"], + "ka": ["კატეგორია"], + "kaa": ["Sanat", "Kategoriya", "Санат", "سانات"], + "kab": ["Taggayt"], + "kbd": ["Категория", "Категориэ"], + "kbp": ["Catégorie"], + "kg": ["Kalasi"], + "kk": ["Sanat", "Санат", "سانات"], + "kl": ["Sumut_atassuseq", "Kategori", "Sumut atassuseq"], + "km": ["ចំនាត់ថ្នាក់ក្រុម", "ចំណាត់ក្រុម", "ចំណាត់ថ្នាក់ក្រុម"], + "kn": ["ವರ್ಗ"], + "ko": ["분류"], + "koi": ["Категория"], + "krc": ["Категория"], + "ks": ["زٲژ"], + "ksh": [ + "Saachjropp", + "Saachjrop", + "Katejori", + "Kategorie", + "Saachjrupp", + "Kattejori", + "Sachjrop", + ], + "ku": ["Kategorî", "پۆل"], + "kv": ["Категория"], + "kw": ["Class", "Klass"], + "ky": ["Категория"], + "la": ["Categoria"], + "lad": ["Kateggoría", "Katēggoría", "Categoría"], + "lb": ["Kategorie"], + "lbe": ["Категория"], + "lez": ["Категория"], + "lfn": ["Categoria"], + "li": ["Categorie", "Kategorie"], + "lij": ["Categorîa", "Categoria"], + "lmo": ["Categuria", "Categoria"], + "ln": ["Catégorie"], + "lo": ["ໝວດ"], + "lrc": ["دأسە"], + "lt": ["Kategorija"], + "ltg": ["Kategoreja"], + "lv": ["Kategorija"], + "mai": ["CA", "श्रेणी"], + "map-bms": ["Kategori"], + "mdf": ["Категорие", "Категория"], + "mg": ["Sokajy", "Catégorie"], + "mhr": ["Категория", "Категорий"], + "min": ["Kategori"], + "mk": ["Категорија"], + "ml": ["വിഭാഗം", "വി", "വർഗ്ഗം", "വ"], + "mn": ["Ангилал"], + "mr": ["वर्ग"], + "mrj": ["Категори", "Категория"], + "ms": ["Kategori"], + "mt": ["Kategorija"], + "mwl": ["Catadorie", "Categoria"], + "my": ["ကဏ္ဍ"], + "myv": ["Категория"], + "mzn": ["رج", "رده"], + "nah": ["Neneuhcāyōtl", "Categoría"], + "nap": ["Categurìa", "Categoria"], + "nds": ["Kategorie"], + "nds-nl": ["Categorie", "Kattegerie", "Kategorie"], + "ne": ["श्रेणी"], + "new": ["पुचः"], + "nl": ["Categorie"], + "nn": ["Kategori"], + "no": ["Kategori"], + "nrm": ["Catégorie"], + "nso": ["Setensele"], + "nv": ["Tʼááłáhági_átʼéego", "Tʼááłáhági átʼéego"], + "oc": ["Categoria"], + "olo": ["Kategourii"], + "or": ["ବିଭାଗ", "ଶ୍ରେଣୀ"], + "os": ["Категори"], + "pa": ["ਸ਼੍ਰੇਣੀ"], + "pcd": ["Catégorie"], + "pdc": ["Abdeeling", "Kategorie"], + "pfl": ["Kadegorie", "Sachgrubb", "Kategorie"], + "pi": ["विभाग"], + "pl": ["Kategoria"], + "pms": ["Categorìa"], + "pnb": ["گٹھ"], + "pnt": ["Κατηγορίαν"], + "ps": ["وېشنيزه"], + 
"pt": ["Categoria"], + "qu": ["Katiguriya"], + "rm": ["Categoria"], + "rmy": ["Shopni"], + "ro": ["Categorie"], + "roa-rup": ["Categorie"], + "roa-tara": ["Categoria"], + "ru": ["Категория", "К"], + "rue": ["Категория", "Катеґорія"], + "rw": ["Ikiciro"], + "sa": ["वर्गः"], + "sah": ["Категория"], + "sat": ["ᱛᱷᱚᱠ"], + "sc": ["Categoria"], + "scn": ["Catigurìa"], + "sd": ["زمرو"], + "se": ["Kategoriija"], + "sg": ["Catégorie"], + "sh": ["Kategorija", "Категорија"], + "si": ["ප්‍රවර්ගය"], + "sk": ["Kategória"], + "sl": ["Kategorija"], + "sq": ["Kategoria", "Kategori"], + "sr": ["Kategorija", "Категорија"], + "srn": ["Categorie", "Guru"], + "stq": ["Kategorie"], + "su": ["Kategori"], + "sv": ["Kategori"], + "sw": ["Jamii"], + "szl": ["Kategoryjo", "Kategoria"], + "ta": ["பகுப்பு"], + "tcy": ["ವರ್ಗೊ"], + "te": ["వర్గం"], + "tet": ["Kategoría", "Kategoria"], + "tg": ["Гурӯҳ"], + "th": ["หมวดหมู่"], + "ti": ["መደብ"], + "tk": ["Kategoriýa"], + "tl": ["Kategorya", "Kaurian"], + "tpi": ["Grup"], + "tr": ["Kategori", "KAT"], + "tt": ["Төркем", "Törkem", "Категория"], + "ty": ["Catégorie"], + "tyv": ["Аңгылал", "Категория"], + "udm": ["Категория"], + "ug": ["تۈر"], + "uk": ["Категория", "Категорія"], + "ur": ["زمرہ"], + "uz": ["Turkum", "Kategoriya"], + "vec": ["Categoria"], + "vep": ["Kategorii"], + "vi": ["Thể_loại", "Thể loại"], + "vls": ["Categorie"], + "vo": ["Klad"], + "wa": ["Categoreye"], + "war": ["Kaarangay"], + "wo": ["Wàll", "Catégorie"], + "wuu": ["分类"], + "xal": ["Янз", "Әәшл"], + "xmf": ["კატეგორია"], + "yi": ["קאטעגאריע", "קאַטעגאָריע"], + "yo": ["Ẹ̀ka"], + "za": ["分类"], + "zea": ["Categorie"], + "zh": ["分类", "分類", "CAT"], + "zh-classical": ["分類", "CAT"], + "zh-min-nan": ["分類", "Lūi-pia̍t"], + "zh-yue": ["分类", "分類", "类", "類"], +} + + +def simple_node_class(mwnode: Any, lang: str = "en") -> str: # noqa: PLR0911, PLR0912 + """e.g., "" -> "Heading".""" + if isinstance(mwnode, str): + return "Text" + else: + nc = str(type(mwnode)).split(".")[-1].split("'")[0] + if nc == "Wikilink": + n_prefix = mwnode.title.split(":", maxsplit=1)[0].lower() + if n_prefix in [ + m.lower() for m in MEDIA_PREFIXES + MEDIA_ALIASES.get(lang, []) + ]: + nc = "Media" + elif n_prefix in [ + c.lower() for c in CAT_PREFIXES + CAT_ALIASES.get(lang, []) + ]: + nc = "Category" + elif nc == "Tag": + tag_type = str(mwnode.tag).lower() + if tag_type in TEXT_FORMATTING_TAGS: + return "Text Formatting" + elif tag_type in LIST_TAGS: + return "List" + elif tag_type == "table": + return "Table" + elif tag_type in TABLE_ELEMENTS_TAGS: + return "Table Element" + elif tag_type == "gallery": + return "Gallery" + elif tag_type == "ref": + return "Reference" + elif tag_type == "noinclude": + return "Comment" + # any others I missed -- e.g., div, meta, etc. + else: + return "Other Tag" + return nc + + +def wikitext_to_plaintext( + wt: str, lang: str, result_type: str | None = None +) -> list[tuple[str, str]] | str: + """Helper function for converting wikitext to plaintext. + + NOTE: I show a few options here but ultimately we'll probably just implement + one set of logic that performs the filtering we want. 
+ """ + plaintext = [extract_text(n, lang) for n in mw.parse(wt).nodes] + if result_type == "plaintext": + return "".join([p[1] for p in plaintext]) + elif result_type == "skip-text-formatting": + return "".join([p[1] for p in plaintext if p[0] != "Text Formatting"]) + elif result_type == "skip-lists": + text = "" + in_list = False + for p in plaintext: + nodetype = p[0] + nodetext = p[1] + if nodetype == "List": + in_list = True + elif in_list: + if "\n" in nodetext: + in_list = False + text += nodetext.split("\n", maxsplit=1)[1] + else: + text += nodetext + return text + else: + return plaintext + + +def extract_text(mwnode: Any, lang: str) -> tuple[str, str]: + """Extract what text would be displayed from any node. + + This is a best effort to convert wikitext to plaintext for any given node. + For some I just skip -- e.g., Comments, Templates -- but we could also add + logic to extract the text from them too if we wanted. + """ + ntype = simple_node_class(mwnode, lang) + plaintext = "" + if ntype == "Text": + plaintext = str(mwnode) + elif ntype == "HTMLEntity": + plaintext = mwnode.normalize() + elif ntype == "Wikilink": + # retain entire link + plaintext = str(mwnode) + # if mwnode.text: + # plaintext = "[[" + str(mwnode.text) + "]]" + # else: + # plaintext = "[[" + str(mwnode.title) + "]]" + elif ntype == "ExternalLink" and mwnode.title: + plaintext = mwnode.title.strip_code() + # tables can have tons of nested references etc. so can't just go through + # standard strip_code + elif ntype == "Table": + # don't collapse whitespace for tables because otherwise strip_code + # sometimes merges text across cells + plaintext = ( + "" # ignore table content # mwnode.contents.strip_code(collapse=False) + ) + elif ntype == "Text Formatting": + plaintext = "".join(extract_text(mwn, lang)[1] for mwn in mwnode.contents.nodes) + # Heading, Template, Comment, Argument, Category, Media, References, URLs + # without display text. + # Tags not listed here (div, gallery, etc.) that almost never have true text content + # and can be super messy. + # Table elements (they duplicate the text if included). + return (ntype, plaintext)
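+
+
+# Illustrative usage sketch only (not used by the training pipeline); the
+# sample wikitext below is made up to show how the result_type options differ.
+if __name__ == "__main__":
+    sample = "'''Earth''' is a [[planet]].\n\n== See also ==\n* [[Moon]]\n"
+    # Joined plaintext: formatting tags reduced to their inner text, links kept
+    # as raw wikitext, headings/templates/comments dropped.
+    print(wikitext_to_plaintext(sample, "en", result_type="plaintext"))
+    # Same, but text inside list items is dropped as well.
+    print(wikitext_to_plaintext(sample, "en", result_type="skip-lists"))
+    # Default: the raw list of (node type, extracted text) tuples.
+    print(wikitext_to_plaintext(sample, "en"))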