Skip to content

Commit 258f5bb

Browse files
committed
util: add some docstrings
1 parent 8f75b47 commit 258f5bb

File tree

1 file changed

+189
-35
lines changed

1 file changed

+189
-35
lines changed

util.py

Lines changed: 189 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,23 @@
1212

1313
# all the external dependencies are imported inside the functions,
1414
# so we can use this file in other projects without installing them.
15-
# or to copy and paste the functions to other projects.
15+
# or copy and paste individual functions into other public projects directly.
1616
# to install them all, you can do:
1717
# pip install requests lxml beautifulsoup4 python-dateutil pytz pyperclip wcwidth rich browser_cookie3
1818

19-
# The first version must start with screen_name and ended with type[index][ dupe].suffix
20-
# The second one allows optional arbitrary prefix or suffix.
21-
# NOTE: to make it simpler, the returned m['extra'] and m['dupe'] will have leading space or hyphen with it.
19+
# ==================== CONSTANTS ====================
20+
21+
'''Twitter media filename handling
22+
The filename format I use, inherited from good old twMediaDownloader (RIP: here is a mirror: https://github.com/fireattack/twMediaDownloader).
23+
The first version must start with screen_name and end with type[index][ dupe].suffix
24+
The second one allows optional arbitrary prefix or suffix.
25+
NOTE: to keep the patterns simple, the returned m['extra'] keeps its leading separator (space, underscore or hyphen) and m['dupe'] keeps any leading spaces.
26+
'''
2227
TWITTER_FILENAME_RE = re.compile(r'^(?P<screen_name>\w+)-(?P<id>\d+)-(?P<date>\d{8})_(?P<time>\d{6})-(?P<type>[^-.]+?)(?P<index>\d*)(?P<dupe> *\(\d+\))?(?P<suffix>\.(?:mp4|zip|jpg|png))$')
2328
TWITTER_FILENAME_RELEXED_RE = re.compile(r'^(?:(?P<prefix>.+?)(?: +?|[-]??))??(?P<screen_name>\w+)-(?P<id>\d+)-(?P<date>\d{8})_(?P<time>\d{6})-(?P<type>[^-.]+?)(?P<index>\d*)(?P<extra>[ _-].+?)??(?P<dupe> *\(\d+\))?(?P<suffix>\.(?:mp4|zip|jpg|png))$')
2429

2530

31+
# ==================== data structure manipulation & misc. ====================
2632
def to_list(a):
    """Wrap a single value in a list, leaving lists and falsy values alone.

    Args:
        a: Any value.

    Returns:
        `a` itself when it is falsy (None, '', 0, an empty container) or is
        already a list; otherwise the one-element list `[a]`.
    """
    # Falsy inputs are passed through unchanged rather than wrapped, so
    # to_list(None) stays None. Only `list` counts as "already a list":
    # tuples and other iterables are wrapped like any scalar.
    if not a or isinstance(a, list):
        return a
    return [a]
2834

@@ -48,6 +54,20 @@ def get_clipboard_data():
4854
return pyperclip.paste()
4955

5056
def safeify(name, ignore_backslash=False):
57+
"""
58+
Replaces illegal characters in a given name with safe alternatives.
59+
60+
Args:
61+
name (str): The name to be made safe.
62+
ignore_backslash (bool, optional): Whether to ignore backslashes. Defaults to False.
63+
64+
Returns:
65+
str: The safe version of the name.
66+
67+
Raises:
68+
AssertionError: If the name is not a string.
69+
"""
70+
5171
assert isinstance(name, str), f'Name must be a string, not {type(name)}'
5272

5373
template = {'\\': '\', '/': '/', ':': ':', '*': '*', '?': '?', '"': '"', '<': '<', '>': '>', '|': '|','\n':'','\r':'','\t':''}
@@ -59,6 +79,19 @@ def safeify(name, ignore_backslash=False):
5979
return name
6080

6181
def format_str(s, width=None, align='left', padding=' '):
82+
"""
83+
Format a string `s` with a specified width, alignment, and padding.
84+
85+
Args:
86+
s (str): The string to be formatted.
87+
width (int, optional): The desired width of the formatted string. If not provided, the original string will be returned as is. Defaults to None.
88+
align (str, optional): The alignment of the formatted string. Possible values are 'left', 'right', and 'center'. Defaults to 'left'.
89+
padding (str, optional): The padding character used to fill the remaining space in the formatted string. Defaults to ' '.
90+
91+
Returns:
92+
str: The formatted string.
93+
94+
"""
6295
# pip install wcwidth
6396
import wcwidth
6497

@@ -395,6 +428,7 @@ def td_format(td_object_or_sec, long_form=True):
395428
else:
396429
return "".join(strings)
397430

431+
# deprecated, just use MyTime(dt).jst()
398432
def to_jp_time(dt, input_timezone=None):
399433
# pip install python-dateutil pytz
400434
from dateutil import parser
@@ -418,6 +452,7 @@ def tac(print=True):
418452
builtins.print(f'Time passed: {t:.2f} s')
419453
return t
420454

455+
# a decorator to time a function
421456
def timeme(func):
422457
def wrapper(*args, **kwargs):
423458
start_time = time.time()
@@ -480,6 +515,10 @@ def remove_empty_folders(directory, remove_root=True): #Including root.
480515
print('Error:', e)
481516

482517
def ensure_nonexist(f):
518+
'''
519+
Return a path that does not exist yet. If `f` is already taken, the name becomes filename_2, filename_3, etc.
520+
'''
521+
483522
i = 2
484523
stem = f.stem
485524
if m:= re.search(r'^(.+?)_(\d)$', stem):
@@ -494,6 +533,9 @@ def ensure_nonexist(f):
494533
return f
495534

496535
def ensure_path_exists(path):
536+
'''
537+
Ensure the path exists; raise FileNotFoundError if it does not.
538+
'''
497539
path = Path(path)
498540
if not path.exists():
499541
raise FileNotFoundError(f'Path {path} does not exist!')
@@ -522,39 +564,72 @@ def quickmd5(f):
522564
return f"{file_size}_{hasher.hexdigest()}" # this way is more readable than just return the hexdigest.
523565

524566
def move_or_delete_duplicate(src, dst, verbose=True, conflict='error'):
525-
if not src.exists():
526-
raise FileNotFoundError(f"The source file {src} does not exist.")
527-
if src == dst:
528-
raise ValueError(f"Source and destination are the same: {src}")
529-
if dst.exists():
530-
if quickmd5(dst) == quickmd5(src):
531-
print(f'[W] {src.name} is a duplicate. Remove.')
532-
src.unlink()
567+
"""
568+
Move `src` to `dst`, or delete `src` if `dst` already exists with the same quickmd5 hash.
569+
570+
Args:
571+
src (pathlib.Path): The path to the source file.
572+
dst (pathlib.Path): The path to the destination file.
573+
verbose (bool, optional): Whether to print the move/rename message. The '[W]' warnings are printed regardless. Defaults to True.
574+
conflict (str, optional): What to do when dst exists with a different hash: 'error' (raise), 'skip' (leave both files untouched), or 'rename' (move to a free name from ensure_nonexist).
575+
Defaults to 'error'.
576+
577+
Raises:
578+
FileNotFoundError: If the source file does not exist.
579+
ValueError: If the source and destination paths are the same.
580+
FileExistsError: If the destination file already exists and the conflict resolution strategy is 'error'.
581+
582+
Returns:
583+
None
584+
"""
585+
if not src.exists():
586+
raise FileNotFoundError(f"The source file {src} does not exist.")
587+
if src == dst:
588+
raise ValueError(f"Source and destination are the same: {src}")
589+
if dst.exists():
590+
if quickmd5(dst) == quickmd5(src):
591+
print(f'[W] {src.name} is a duplicate. Remove.')
592+
src.unlink()
593+
return
594+
else:
595+
if conflict == 'skip':
596+
print(f'[W] Destination file {src.name} already exists and hash does not match. Skip.')
533597
return
534-
else:
535-
if conflict == 'skip':
536-
print(f'[W] Destination file {src.name} already exists and hash does not match. Skip.')
537-
return
538-
elif conflict == 'error':
539-
raise FileExistsError(f"Destination file {dst} already exists.")
540-
elif conflict == 'rename':
541-
dst = ensure_nonexist(dst)
542-
print(f'[W] Destination file {src.name} already exists. Use filename {dst.name} instead.')
543-
if verbose:
544-
if src.parent == dst.parent:
545-
print(f"Rename {src.name} to {dst.name}")
546-
elif src.name == dst.name:
547-
print(f'Move {src.name} into {dst.parent}')
548-
else:
549-
print(f'Move {src.name} to {dst}')
550-
shutil.move(src, dst)
598+
elif conflict == 'error':
599+
raise FileExistsError(f"Destination file {dst} already exists.")
600+
elif conflict == 'rename':
601+
dst = ensure_nonexist(dst)
602+
print(f'[W] Destination file {src.name} already exists. Use filename {dst.name} instead.')
603+
if verbose:
604+
if src.parent == dst.parent:
605+
print(f"Rename {src.name} to {dst.name}")
606+
elif src.name == dst.name:
607+
print(f'Move {src.name} into {dst.parent}')
608+
else:
609+
print(f'Move {src.name} to {dst}')
610+
shutil.move(src, dst)
551611

552612
def batch_rename(renamings):
553613
"""
554-
Non-conflict batch rename.
555-
renamings: list of (Path object, newname)
556-
"""
614+
Batch rename files without conflicts.
615+
616+
Args:
617+
renamings (list): A list of tuples containing the original file paths and the new names.
618+
619+
Returns:
620+
None
557621
622+
Raises:
623+
None
624+
625+
This function renames multiple files simultaneously without causing conflicts. It checks for duplicate files
626+
in the list, duplicate new filenames, and conflicts with existing files. If any conflicts are detected, the
627+
function prints an error message and aborts the renaming process.
628+
629+
Example usage:
630+
renamings = [(Path('file1.txt'), 'new_file1.txt'), (Path('file2.txt'), 'new_file2.txt')]
631+
batch_rename(renamings)
632+
"""
558633
files = [f for f, _ in renamings]
559634
dst_files = [f.with_name(name) for f, name in renamings]
560635

@@ -584,13 +659,26 @@ def batch_rename(renamings):
584659
f.rename(f.with_name(name))
585660

586661
# ==================== network related ====================
587-
# Modified from https://www.peterbe.com/plog/best-practice-with-retries-with-requests
588662
def requests_retry_session(
589663
retries=5,
590664
backoff_factor=0.2,
591665
status_forcelist=(502, 503, 504),
592666
session=None,
593667
):
668+
"""
669+
Create a session object with retry functionality for making HTTP requests.
670+
Modified from https://www.peterbe.com/plog/best-practice-with-retries-with-requests
671+
672+
Args:
673+
retries (int): The maximum number of retries for each request. Default is 5.
674+
backoff_factor (float): The backoff factor between retries. Default is 0.2.
675+
status_forcelist (tuple): A tuple of HTTP status codes that should trigger a retry. Default is (502, 503, 504).
676+
session (requests.Session): An existing session object to use. If not provided, a new session will be created.
677+
678+
Returns:
679+
requests.Session: The session object with retry functionality.
680+
681+
"""
594682
# pip install requests urllib3
595683
import requests
596684
from requests.adapters import HTTPAdapter
@@ -608,7 +696,29 @@ def requests_retry_session(
608696
return session
609697

610698
def get(url, headers=None, cookies=None, encoding=None, session=None, parser='lxml', timeout=None):
611-
# pip install requests lxml beautifulsoup4
699+
"""
700+
Sends a GET request to the specified URL and returns the parsed HTML content.
701+
702+
Args:
703+
url (str): The URL to send the GET request to.
704+
headers (dict, optional): The headers to include in the request. Defaults to None.
705+
cookies (dict, optional): The cookies to include in the request. Defaults to None.
706+
encoding (str, optional): The encoding to use when parsing the HTML content. Defaults to None.
707+
session (requests.Session, optional): The session to use for the request. Defaults to None.
708+
parser (str, optional): The parser to use for parsing the HTML content. Defaults to 'lxml'.
709+
timeout (float, optional): The maximum number of seconds to wait for the request to complete. Defaults to None.
710+
711+
Returns:
712+
BeautifulSoup: The parsed HTML content.
713+
714+
Raises:
715+
Any exceptions raised by the underlying requests library.
716+
717+
Dependencies:
718+
- requests
719+
- lxml
720+
- beautifulsoup4
721+
"""
612722
from bs4 import BeautifulSoup
613723

614724
if not session:
@@ -622,6 +732,27 @@ def get(url, headers=None, cookies=None, encoding=None, session=None, parser='lx
622732
def get_webname(url):
    """Return the percent-decoded last path segment of a URL.

    Args:
        url (str): The URL (or bare path) to extract the name from.

    Returns:
        str: The text after the last '/' and before the first '?', with
        %XX escapes decoded. Empty when that part of the URL ends with '/'.
    """
    # `unquote` is expected to be urllib.parse.unquote, imported at file level.
    # Drop the query string first so a '/' inside it cannot be mistaken for a
    # path separator.
    without_query = url.split('?')[0]
    last_segment = without_query.split('/')[-1]
    return unquote(last_segment)
624734

735+
def load_cookie(s):
736+
"""
737+
Load cookies from various sources and convert them to a `RequestsCookieJar` object.
738+
739+
Args:
740+
s (str): The input string, file path containing the cookies, or "{browser_name}/{domain_name}" to load cookies from a browser.
741+
742+
Returns:
743+
requests.cookies.RequestsCookieJar: The converted `RequestsCookieJar` object.
744+
745+
Raises:
746+
ValueError: If the input string is invalid.
747+
748+
Examples:
749+
>>> load_cookie('cookie1=value1; cookie2=value2')
750+
<RequestsCookieJar[Cookie(version=0, name='cookie1', value='value1', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=False, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False), Cookie(version=0, name='cookie2', value='value2', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=False, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)]>
751+
752+
>>> load_cookie('/path/to/cookies.txt')
753+
<RequestsCookieJar[Cookie(version=0, name='cookie1', value='value1', port=None, port_specified=False, domain='example.com', domain_specified=False, domain_initial_dot=False, path='/', path_specified=False, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False), Cookie(version=0, name='cookie2', value='value2', port=None, port_specified=False, domain='example.com', domain_specified=False, domain_initial_dot=False, path='/', path_specified=False, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)]>
754+
"""
755+
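# NOTE(review): this docstring-only stub is shadowed by the second `def load_cookie(s):` that follows it, so the docstring above never reaches the real function. Move the docstring into that definition and delete this stub.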
625756
def load_cookie(s):
626757
from http.cookiejar import MozillaCookieJar
627758
from requests.cookies import RequestsCookieJar, create_cookie
@@ -669,10 +800,33 @@ def convert(cj):
669800

670801
raise ValueError(f'Invalid cookie string: {s}')
671802

672-
673803
def download(url, filename=None, save_path='.', cookies=None, session=None, dry_run=False,
674804
dupe='skip_same_size', referer=None, headers=None, placeholder=True, prefix='',
675805
get_suffix=True, verbose=2, retry_failed=True):
806+
"""
807+
Downloads a file from the given URL and saves it to the specified location.
808+
809+
Args:
810+
url (str): The URL of the file to download.
811+
filename (str, optional): The name of the file to save. If not provided, the filename will be extracted from the URL or the response header. Defaults to None.
812+
save_path (str, optional): The directory path to save the file. Defaults to '.' (current directory).
813+
cookies (dict, optional): A dictionary of cookies to include in the request. Defaults to None.
814+
session (requests.Session, optional): A requests Session object to use for the request. Defaults to None.
815+
dry_run (bool, optional): If True, only prints the URL and does not perform the actual download. Defaults to False.
816+
dupe (str, optional): The method to handle duplicate files. Must be one of 'skip', 'overwrite', 'rename', or 'skip_same_size'. Defaults to 'skip_same_size'.
817+
referer (str, optional): The referer header to include in the request. Defaults to None.
818+
headers (dict, optional): Additional headers to include in the request. Defaults to None.
819+
placeholder (bool, optional): If True, creates a placeholder file with a '.broken' extension if the download fails. Defaults to True.
820+
prefix (str, optional): A prefix to add to the filename. Useful when fetching the filename from the response headers. Defaults to ''.
821+
get_suffix (bool, optional): If True, attempts to determine the file extension from the response headers. Defaults to True.
822+
verbose (int, optional): The verbosity level of the download progress. Must be 0, 1, or 2. Defaults to 2.
823+
retry_failed (bool, optional): If True, retries the download if it fails. Defaults to True.
824+
825+
Returns:
826+
str: The status of the download. Can be 'Dry run', 'Exists', or the HTTP status code if the download fails.
827+
"""
828+
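# NOTE(review): the `cgi` module imported below was deprecated in Python 3.11 and removed in 3.13 (PEP 594); parse the Content-Disposition header with `email.message` instead.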
829+
676830
from cgi import parse_header
677831
from mimetypes import guess_extension
678832

0 commit comments

Comments
 (0)