diff --git a/training/add_a_link/src/add_a_link/release/export_hdfs_table_to_pkl.py b/training/add_a_link/src/add_a_link/release/export_hdfs_table_to_pkl.py index 54dd01ef536b6e2e526ee119473f3d7d8bab9384..8d696e8212daa64806998096d3752885b5efeb62 100644 --- a/training/add_a_link/src/add_a_link/release/export_hdfs_table_to_pkl.py +++ b/training/add_a_link/src/add_a_link/release/export_hdfs_table_to_pkl.py @@ -2,9 +2,9 @@ import os import pickle import click -import tqdm from add_a_link.release.release_utils import copy_from_local_to_hdfs from pyspark.sql import SparkSession +from tqdm import tqdm def run(spark: SparkSession, directory: str): diff --git a/training/add_a_link/src/add_a_link/release/export_tables.py b/training/add_a_link/src/add_a_link/release/export_tables.py index 4b45f51c542ec953e5a9efdcfd035f0a3b6bda6a..fee3e99573962a648c27fc9041ba1756602e7ed1 100644 --- a/training/add_a_link/src/add_a_link/release/export_tables.py +++ b/training/add_a_link/src/add_a_link/release/export_tables.py @@ -1,14 +1,17 @@ import gzip +import os import subprocess import click -from add_a_link.release.mysql_utils import get_connection_dict, read_password +from add_a_link.release.mysql_utils import get_mysql_connection, read_password from add_a_link.release.release_utils import copy_from_local_to_hdfs def run(directory: str, wiki_id: str, maria_db_password_file_hdfs_path: str): + os.makedirs(directory, exist_ok=True) read_password(maria_db_password_file_hdfs_path) - connection_dict = get_connection_dict() + mysql_conn = get_mysql_connection() + cursor = mysql_conn.cursor() list_fname = ["anchors", "pageids", "redirects", "w2vfiltered"] table_prefix = "lr" for table in list_fname: @@ -16,33 +19,20 @@ def run(directory: str, wiki_id: str, maria_db_password_file_hdfs_path: str): print(f"Exporting table : {table_name}") filename = f"{directory}/{table_name}.sql.gz" - mysqldump_command = [ - "mysqldump", - ] - mysqldump_command += [ - "--skip-opt", - "--no-create-info", - "--skip-extended-insert", - "--skip-create-options", - f"-u{connection_dict['user']}", - ] + cursor.execute(f"SELECT * FROM {table_name}") + columns = [desc[0] for desc in cursor.description] + rows = cursor.fetchall() + lines = [] + for row in rows: + values = [ + mysql_conn.escape(val) if val is not None else "NULL" for val in row + ] + stmt = f"INSERT INTO `{table_name}` ({', '.join(columns)}) VALUES ({', '.join(values)});\n" + # keep the raw bytes + lines.append(stmt.encode("utf-8", errors="surrogateescape")) - mysqldump_command += [f"-p{connection_dict['password']}"] - - mysqldump_command += [ - "--lock-tables=false", - "-h", - connection_dict["host"], - "--port", - str(connection_dict["port"]), - connection_dict["database"], - table_name, - ] with gzip.open(filename, "wb") as gzip_file: - mysqldump = subprocess.Popen( - mysqldump_command, stdout=subprocess.PIPE, bufsize=-1 - ) - gzip_file.writelines(mysqldump.stdout) + gzip_file.writelines(lines) gzip_file.close() copy_from_local_to_hdfs(filename, filename) checksum_filename = f"{filename}.checksum" diff --git a/training/add_a_link/src/add_a_link/release/mysql_utils.py b/training/add_a_link/src/add_a_link/release/mysql_utils.py index 99f86814bbd310dd2f8ef660f18ff9d51cac82ab..871252b546c0dfb9fba92532d4f6d6e2b039856f 100644 --- a/training/add_a_link/src/add_a_link/release/mysql_utils.py +++ b/training/add_a_link/src/add_a_link/release/mysql_utils.py @@ -2,6 +2,7 @@ import os import fsspec import pymysql +from add_a_link.utils import configure_fsspec from dotenv import load_dotenv pymysql.install_as_MySQLdb() @@ -10,6 +11,7 @@ import MySQLdb # noqa: E402 load_dotenv() +configure_fsspec() fs_hdfs = fsspec.filesystem("hdfs") diff --git a/training/add_a_link/src/add_a_link/release/release_utils.py b/training/add_a_link/src/add_a_link/release/release_utils.py index 67ee4cb94376b7449f3166a15523103a0f97625c..5f517bf63589ae35bfe16c0b0e7472efd62042fb 100644 --- a/training/add_a_link/src/add_a_link/release/release_utils.py +++ b/training/add_a_link/src/add_a_link/release/release_utils.py @@ -1,5 +1,9 @@ +import os + import fsspec +from add_a_link.utils import configure_fsspec +configure_fsspec() fs_local = fsspec.filesystem("file") fs_hdfs = fsspec.filesystem("hdfs") @@ -11,6 +15,7 @@ def copy_from_local_to_hdfs(local_path: str, hdfs_path: str): def copy_from_hdfs_to_local(hdfs_path: str, local_path: str): + os.makedirs("/".join(local_path.split("/")[:-1]), exist_ok=True) with fs_hdfs.open(hdfs_path, "rb") as fsrc: with fs_local.open(local_path, "wb") as fdst: fdst.write(fsrc.read())