1212
1313# all the external dependencies are imported inside the functions,
1414# so we can use this file in other projects without installing them.
15- # or to copy and paste the functions to other projects.
15+ # or to copy and paste the functions to other public projects directly .
1616# to install them all, you can do:
1717# pip install requests lxml beautifulsoup4 python-dateutil pytz pyperclip wcwidth rich browser_cookie3
1818
19- # The first version must start with screen_name and ended with type[index][ dupe].suffix
20- # The second one allows optional arbitrary prefix or suffix.
21- # NOTE: to make it simpler, the returned m['extra'] and m['dupe'] will have leading space or hyphen with it.
19+ # ==================== CONSTANTS ====================
20+
21+ '''Twitter media filename handling
22+ The filename format I use, inherited from good old twMediaDownloader (RIP; here is a mirror: https://github.com/fireattack/twMediaDownloader).
23+ The first version must start with screen_name and end with type[index][ dupe].suffix
24+ The second one allows optional arbitrary prefix or suffix.
25+ NOTE: to make it simpler, the returned m['extra'] and m['dupe'] will have leading space or hyphen with it.
26+ '''
2227TWITTER_FILENAME_RE = re .compile (r'^(?P<screen_name>\w+)-(?P<id>\d+)-(?P<date>\d{8})_(?P<time>\d{6})-(?P<type>[^-.]+?)(?P<index>\d*)(?P<dupe> *\(\d+\))?(?P<suffix>\.(?:mp4|zip|jpg|png))$' )
2328TWITTER_FILENAME_RELEXED_RE = re .compile (r'^(?:(?P<prefix>.+?)(?: +?|[-]??))??(?P<screen_name>\w+)-(?P<id>\d+)-(?P<date>\d{8})_(?P<time>\d{6})-(?P<type>[^-.]+?)(?P<index>\d*)(?P<extra>[ _-].+?)??(?P<dupe> *\(\d+\))?(?P<suffix>\.(?:mp4|zip|jpg|png))$' )
2429
2530
31+ # ==================== data structure manipulation & misc. ====================
def to_list(a):
    """
    Wrap a single value in a list, leaving lists and falsy values untouched.

    Args:
        a: Any value.

    Returns:
        `a` itself when it is falsy (None, '', 0, [], ...) or already a list;
        otherwise the one-element list `[a]`. Other containers such as tuples
        are wrapped as a whole, not converted.
    """
    # Falsy values are passed through unchanged so that callers can keep
    # using `None` / empty values as "nothing given".
    if not a or isinstance(a, list):
        return a
    return [a]
2834
@@ -48,6 +54,20 @@ def get_clipboard_data():
4854 return pyperclip .paste ()
4955
5056def safeify (name , ignore_backslash = False ):
57+ """
58+ Replaces illegal characters in a given name with safe alternatives.
59+
60+ Args:
61+ name (str): The name to be made safe.
62+ ignore_backslash (bool, optional): Whether to ignore backslashes. Defaults to False.
63+
64+ Returns:
65+ str: The safe version of the name.
66+
67+ Raises:
68+ AssertionError: If the name is not a string.
69+ """
70+
5171 assert isinstance (name , str ), f'Name must be a string, not { type (name )} '
5272
5373 template = {'\\ ' : '\' , '/' : '/' , ':' : ':' , '*' : '*' , '?' : '?' , '"' : '"' , '<' : '<' , '>' : '>' , '|' : '|' ,'\n ' :'' ,'\r ' :'' ,'\t ' :'' }
@@ -59,6 +79,19 @@ def safeify(name, ignore_backslash=False):
5979 return name
6080
6181def format_str (s , width = None , align = 'left' , padding = ' ' ):
82+ """
83+ Format a string `s` with a specified width, alignment, and padding.
84+
85+ Args:
86+ s (str): The string to be formatted.
87+ width (int, optional): The desired width of the formatted string. If not provided, the original string will be returned as is. Defaults to None.
88+ align (str, optional): The alignment of the formatted string. Possible values are 'left', 'right', and 'center'. Defaults to 'left'.
89+ padding (str, optional): The padding character used to fill the remaining space in the formatted string. Defaults to ' '.
90+
91+ Returns:
92+ str: The formatted string.
93+
94+ """
6295 # pip install wcwidth
6396 import wcwidth
6497
@@ -395,6 +428,7 @@ def td_format(td_object_or_sec, long_form=True):
395428 else :
396429 return "" .join (strings )
397430
431+ # deprecated, just use MyTime(dt).jst()
398432def to_jp_time (dt , input_timezone = None ):
399433 # pip install python-dateutil pytz
400434 from dateutil import parser
@@ -418,6 +452,7 @@ def tac(print=True):
418452 builtins .print (f'Time passed: { t :.2f} s' )
419453 return t
420454
455+ # a decorator to time a function
421456def timeme (func ):
422457 def wrapper (* args , ** kwargs ):
423458 start_time = time .time ()
@@ -480,6 +515,10 @@ def remove_empty_folders(directory, remove_root=True): #Including root.
480515 print ('Error:' , e )
481516
482517def ensure_nonexist (f ):
518+ '''
519+ Ensure the file does not exist. If it does, rename it to filename_2, filename_3, etc.
520+ '''
521+
483522 i = 2
484523 stem = f .stem
485524 if m := re .search (r'^(.+?)_(\d)$' , stem ):
@@ -494,6 +533,9 @@ def ensure_nonexist(f):
494533 return f
495534
496535def ensure_path_exists (path ):
536+ '''
537+ Check that the path exists; raise FileNotFoundError if it does not.
538+ '''
497539 path = Path (path )
498540 if not path .exists ():
499541 raise FileNotFoundError (f'Path { path } does not exist!' )
@@ -522,39 +564,72 @@ def quickmd5(f):
522564 return f"{ file_size } _{ hasher .hexdigest ()} " # this way is more readable than just return the hexdigest.
523565
def move_or_delete_duplicate(src, dst, verbose=True, conflict='error'):
    """
    Move `src` to `dst`, or delete `src` when `dst` already holds the same content.

    Two files count as duplicates when `quickmd5` returns the same value for both.

    Args:
        src (str | Path): The path to the source file.
        dst (str | Path): The path to the destination file.
        verbose (bool, optional): Whether to print the move/rename that is performed. Defaults to True.
        conflict (str, optional): What to do when `dst` exists but is not a duplicate of `src`.
            One of 'error' (raise), 'skip' (leave both files alone) or 'rename'
            (move to the alternative name returned by `ensure_nonexist`). Defaults to 'error'.

    Raises:
        ValueError: If `conflict` is not one of the known strategies, or if the source and
            destination paths are the same.
        FileNotFoundError: If the source file does not exist.
        FileExistsError: If the destination file already exists with different content and the
            conflict resolution strategy is 'error'.

    Returns:
        None
    """
    # Reject unknown strategies up front: otherwise a typo would fall through every
    # branch below and reach shutil.move() with an existing, different destination.
    if conflict not in ('error', 'skip', 'rename'):
        raise ValueError(f"Unknown conflict strategy: {conflict!r}. Use 'error', 'skip' or 'rename'.")

    # Accept plain strings as well as Path objects.
    src = Path(src)
    dst = Path(dst)

    if not src.exists():
        raise FileNotFoundError(f"The source file {src} does not exist.")
    if src == dst:
        raise ValueError(f"Source and destination are the same: {src}")

    if dst.exists():
        if quickmd5(dst) == quickmd5(src):
            print(f'[W] {src.name} is a duplicate. Remove.')
            src.unlink()
            return
        if conflict == 'skip':
            print(f'[W] Destination file {src.name} already exists and hash does not match. Skip.')
            return
        if conflict == 'error':
            raise FileExistsError(f"Destination file {dst} already exists.")
        # conflict == 'rename'
        dst = ensure_nonexist(dst)
        print(f'[W] Destination file {src.name} already exists. Use filename {dst.name} instead.')

    if verbose:
        if src.parent == dst.parent:
            print(f"Rename {src.name} to {dst.name}")
        elif src.name == dst.name:
            print(f'Move {src.name} into {dst.parent}')
        else:
            print(f'Move {src.name} to {dst}')
    shutil.move(src, dst)
551611
552612def batch_rename (renamings ):
553613 """
554- Non-conflict batch rename.
555- renamings: list of (Path object, newname)
556- """
614+ Batch rename files without conflicts.
615+
616+ Args:
617+ renamings (list): A list of tuples containing the original file paths and the new names.
618+
619+ Returns:
620+ None
557621
622+ Raises:
623+ None
624+
625+ This function renames multiple files simultaneously without causing conflicts. It checks for duplicate files
626+ in the list, duplicate new filenames, and conflicts with existing files. If any conflicts are detected, the
627+ function prints an error message and aborts the renaming process.
628+
629+ Example usage:
630+ renamings = [(Path('file1.txt'), 'new_file1.txt'), (Path('file2.txt'), 'new_file2.txt')]
631+ batch_rename(renamings)
632+ """
558633 files = [f for f , _ in renamings ]
559634 dst_files = [f .with_name (name ) for f , name in renamings ]
560635
@@ -584,13 +659,26 @@ def batch_rename(renamings):
584659 f .rename (f .with_name (name ))
585660
586661# ==================== network related ====================
587- # Modified from https://www.peterbe.com/plog/best-practice-with-retries-with-requests
588662def requests_retry_session (
589663 retries = 5 ,
590664 backoff_factor = 0.2 ,
591665 status_forcelist = (502 , 503 , 504 ),
592666 session = None ,
593667):
668+ """
669+ Create a session object with retry functionality for making HTTP requests.
670+ Modified from https://www.peterbe.com/plog/best-practice-with-retries-with-requests
671+
672+ Args:
673+ retries (int): The maximum number of retries for each request. Default is 5.
674+ backoff_factor (float): The backoff factor between retries. Default is 0.2.
675+ status_forcelist (tuple): A tuple of HTTP status codes that should trigger a retry. Default is (502, 503, 504).
676+ session (requests.Session): An existing session object to use. If not provided, a new session will be created.
677+
678+ Returns:
679+ requests.Session: The session object with retry functionality.
680+
681+ """
594682 # pip install requests urllib3
595683 import requests
596684 from requests .adapters import HTTPAdapter
@@ -608,7 +696,29 @@ def requests_retry_session(
608696 return session
609697
610698def get (url , headers = None , cookies = None , encoding = None , session = None , parser = 'lxml' , timeout = None ):
611- # pip install requests lxml beautifulsoup4
699+ """
700+ Sends a GET request to the specified URL and returns the parsed HTML content.
701+
702+ Args:
703+ url (str): The URL to send the GET request to.
704+ headers (dict, optional): The headers to include in the request. Defaults to None.
705+ cookies (dict, optional): The cookies to include in the request. Defaults to None.
706+ encoding (str, optional): The encoding to use when parsing the HTML content. Defaults to None.
707+ session (requests.Session, optional): The session to use for the request. Defaults to None.
708+ parser (str, optional): The parser to use for parsing the HTML content. Defaults to 'lxml'.
709+ timeout (float, optional): The maximum number of seconds to wait for the request to complete. Defaults to None.
710+
711+ Returns:
712+ BeautifulSoup: The parsed HTML content.
713+
714+ Raises:
715+ Any exceptions raised by the underlying requests library.
716+
717+ Dependencies:
718+ - requests
719+ - lxml
720+ - beautifulsoup4
721+ """
612722 from bs4 import BeautifulSoup
613723
614724 if not session :
@@ -622,6 +732,27 @@ def get(url, headers=None, cookies=None, encoding=None, session=None, parser='lx
def get_webname(url):
    """
    Return the percent-decoded last path segment of a URL.

    Args:
        url (str): The URL (or URL-like string) to take the name from.

    Returns:
        str: Everything after the last '/' of the part of `url` that precedes
        the query string and fragment, with percent-escapes decoded.
    """
    # Drop the fragment first (it may itself contain '?'), then the query,
    # so neither ends up in the returned name.
    without_fragment = url.split('#', 1)[0]
    without_query = without_fragment.split('?', 1)[0]
    return unquote(without_query.split('/')[-1])
624734
735+ def load_cookie (s ):
736+ """
737+ Load cookies from various sources and convert them to a `RequestsCookieJar` object.
738+
739+ Args:
740+ s (str): The input string, file path containing the cookies, or "{browser_name}/{domain_name}" to load cookies from a browser.
741+
742+ Returns:
743+ requests.cookies.RequestsCookieJar: The converted `RequestsCookieJar` object.
744+
745+ Raises:
746+ ValueError: If the input string is invalid.
747+
748+ Examples:
749+ >>> load_cookie('cookie1=value1; cookie2=value2')
750+ <RequestsCookieJar[Cookie(version=0, name='cookie1', value='value1', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=False, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False), Cookie(version=0, name='cookie2', value='value2', port=None, port_specified=False, domain='', domain_specified=False, domain_initial_dot=False, path='/', path_specified=False, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)]>
751+
752+ >>> load_cookie('/path/to/cookies.txt')
753+ <RequestsCookieJar[Cookie(version=0, name='cookie1', value='value1', port=None, port_specified=False, domain='example.com', domain_specified=False, domain_initial_dot=False, path='/', path_specified=False, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False), Cookie(version=0, name='cookie2', value='value2', port=None, port_specified=False, domain='example.com', domain_specified=False, domain_initial_dot=False, path='/', path_specified=False, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)]>
754+ """
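755+ # NOTE(review): this stub is shadowed by the real `load_cookie` definition right below, so the docstring above never reaches the working function; move the docstring into that definition and delete this stub.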
625756def load_cookie (s ):
626757 from http .cookiejar import MozillaCookieJar
627758 from requests .cookies import RequestsCookieJar , create_cookie
@@ -669,10 +800,33 @@ def convert(cj):
669800
670801 raise ValueError (f'Invalid cookie string: { s } ' )
671802
672-
673803def download (url , filename = None , save_path = '.' , cookies = None , session = None , dry_run = False ,
674804 dupe = 'skip_same_size' , referer = None , headers = None , placeholder = True , prefix = '' ,
675805 get_suffix = True , verbose = 2 , retry_failed = True ):
806+ """
807+ Downloads a file from the given URL and saves it to the specified location.
808+
809+ Args:
810+ url (str): The URL of the file to download.
811+ filename (str, optional): The name of the file to save. If not provided, the filename will be extracted from the URL or the response header. Defaults to None.
812+ save_path (str, optional): The directory path to save the file. Defaults to '.' (current directory).
813+ cookies (dict, optional): A dictionary of cookies to include in the request. Defaults to None.
814+ session (requests.Session, optional): A requests Session object to use for the request. Defaults to None.
815+ dry_run (bool, optional): If True, only prints the URL and does not perform the actual download. Defaults to False.
816+ dupe (str, optional): The method to handle duplicate files. Must be one of 'skip', 'overwrite', 'rename', or 'skip_same_size'. Defaults to 'skip_same_size'.
817+ referer (str, optional): The referer header to include in the request. Defaults to None.
818+ headers (dict, optional): Additional headers to include in the request. Defaults to None.
819+ placeholder (bool, optional): If True, creates a placeholder file with a '.broken' extension if the download fails. Defaults to True.
820+ prefix (str, optional): A prefix to add to the filename. Useful when fetching the filename from the response headers. Defaults to ''.
821+ get_suffix (bool, optional): If True, attempts to determine the file extension from the response headers. Defaults to True.
822+ verbose (int, optional): The verbosity level of the download progress. Must be 0, 1, or 2. Defaults to 2.
823+ retry_failed (bool, optional): If True, retries the download if it fails. Defaults to True.
824+
825+ Returns:
826+ str: The status of the download. Can be 'Dry run', 'Exists', or the HTTP status code if the download fails.
827+ """
828+ # Rest of the code...
829+
676830 from cgi import parse_header
677831 from mimetypes import guess_extension
678832
0 commit comments