Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 68 additions & 12 deletions utils/crawl_github_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,21 +135,77 @@ def should_include_file(file_path: str, file_name: str) -> bool:
owner = path_parts[0]
repo = path_parts[1]

# Check if URL contains a specific branch/commit
if 'tree' in path_parts:
tree_index = path_parts.index('tree')
ref = path_parts[tree_index + 1]
# Combine all parts after the ref as the path
path_start = tree_index + 2
specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else ""
else:
ref = "main" # Default branch
specific_path = ""

# Setup for GitHub API
headers = {"Accept": "application/vnd.github.v3+json"}
if token:
headers["Authorization"] = f"token {token}"

def fetch_branches(owner: str, repo: str):
"""Get brancshes of the repository"""

url = f"https://api.github.com/repos/{owner}/{repo}/branches"
response = requests.get(url, headers=headers)

if response.status_code == 404:
if not token:
print(f"Error 404: Repository not found or is private.\n"
f"If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable.")
else:
print(f"Error 404: Repository not found or insufficient permissions with the provided token.\n"
f"Please verify the repository exists and the token has access to this repository.")
return []

if response.status_code != 200:
print(f"Error fetching the branches of {owner}/{path}: {response.status_code} - {response.text}")
return []

return response.json()

def check_tree(owner: str, repo: str, tree: str):
"""Check the repository has the given tree"""

url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree}"
response = requests.get(url, headers=headers)

return True if response.status_code == 200 else False

# Check if URL contains a specific branch/commit
if len(path_parts) > 2 and 'tree' == path_parts[2]:
join_parts = lambda i: '/'.join(path_parts[i:])

branches = fetch_branches(owner, repo)
branch_names = map(lambda branch: branch.get("name"), branches)

# Fetching branches is not successfully
if len(branches) == 0:
return

# To check branch name
relevant_path = join_parts(3)

# Find a match with relevant path and get the branch name
filter_gen = (name for name in branch_names if relevant_path.startswith(name))
ref = next(filter_gen, None)

# If match is not found, check for is it a tree
if ref == None:
tree = path_parts[3]
ref = tree if check_tree(owner, repo, tree) else None

# If it is neither a tree nor a branch name
if ref == None:
print(f"The given path does not match with any branch and any tree in the repository.\n"
f"Please verify the path is exists.")
return

# Combine all parts after the ref as the path
part_index = 5 if '/' in ref else 4
specific_path = join_parts(part_index) if part_index < len(path_parts) else ""
else:
# Dont put the ref param to quiery
# and let Github decide default branch
ref = None
specific_path = ""

# Dictionary to store path -> content mapping
files = {}
Expand All @@ -158,7 +214,7 @@ def should_include_file(file_path: str, file_name: str) -> bool:
def fetch_contents(path):
"""Fetch contents of the repository at a specific path and commit"""
url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
params = {"ref": ref}
params = {"ref": ref} if ref != None else {}

response = requests.get(url, headers=headers, params=params)

Expand Down