The-Pocket · zachary62 · Apr 24, 2025 · Apr 23, 2025 · Apr 23, 2025 · Apr 24, 2025
diff --git a/utils/crawl_github_files.py b/utils/crawl_github_files.py
@@ -135,21 +135,77 @@ def should_include_file(file_path: str, file_name: str) -> bool:
     owner = path_parts[0]
     repo = path_parts[1]
 
-    # Check if URL contains a specific branch/commit
-    if 'tree' in path_parts:
-        tree_index = path_parts.index('tree')
-        ref = path_parts[tree_index + 1]
-        # Combine all parts after the ref as the path
-        path_start = tree_index + 2
-        specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else ""
-    else:
-        ref = "main"  # Default branch
-        specific_path = ""
-
     # Setup for GitHub API
     headers = {"Accept": "application/vnd.github.v3+json"}
     if token:
         headers["Authorization"] = f"token {token}"
+
+    def fetch_branches(owner: str, repo: str):
+        """Get brancshes of the repository"""
+
+        url = f"https://api.github.com/repos/{owner}/{repo}/branches"
+        response = requests.get(url, headers=headers)
+
+        if response.status_code == 404:
+            if not token:
+                print(f"Error 404: Repository not found or is private.\n"
+                      f"If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable.")
+            else:
+                print(f"Error 404: Repository not found or insufficient permissions with the provided token.\n"
+                      f"Please verify the repository exists and the token has access to this repository.")
+            return []
+
+        if response.status_code != 200:
+            print(f"Error fetching the branches of {owner}/{path}: {response.status_code} - {response.text}")
+            return []
+
+        return response.json()
+
+    def check_tree(owner: str, repo: str, tree: str):
+        """Check the repository has the given tree"""
+
+        url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{tree}"
+        response = requests.get(url, headers=headers)
+
+        return True if response.status_code == 200 else False 
+
+    # Check if URL contains a specific branch/commit
+    if len(path_parts) > 2 and 'tree' == path_parts[2]:
+        join_parts = lambda i: '/'.join(path_parts[i:])
+
+        branches = fetch_branches(owner, repo)
+        branch_names = map(lambda branch: branch.get("name"), branches)
+
+        # Fetching branches is not successfully
+        if len(branches) == 0:
+            return
+
+        # To check branch name
+        relevant_path = join_parts(3)
+
+        # Find a match with relevant path and get the branch name
+        filter_gen = (name for name in branch_names if relevant_path.startswith(name))
+        ref = next(filter_gen, None)
+
+        # If match is not found, check for is it a tree
+        if ref == None:
+            tree = path_parts[3]
+            ref = tree if check_tree(owner, repo, tree) else None
+
+        # If it is neither a tree nor a branch name
+        if ref == None:
+            print(f"The given path does not match with any branch and any tree in the repository.\n"
+                  f"Please verify the path is exists.")
+            return
+
+        # Combine all parts after the ref as the path
+        part_index = 5 if '/' in ref else 4
+        specific_path = join_parts(part_index) if part_index < len(path_parts) else ""
+    else:
+        # Dont put the ref param to quiery
+        # and let Github decide default branch
+        ref = None
+        specific_path = ""
 
     # Dictionary to store path -> content mapping
     files = {}
@@ -158,7 +214,7 @@ def should_include_file(file_path: str, file_name: str) -> bool:
     def fetch_contents(path):
         """Fetch contents of the repository at a specific path and commit"""
         url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
-        params = {"ref": ref}
+        params = {"ref": ref} if ref != None else {}
 
         response = requests.get(url, headers=headers, params=params)