Skip to content

Commit f4c6fda

Browse files
committed
handle multi-step redirects correctly
1 parent bf9be0e commit f4c6fda

File tree

1 file changed

+18
-7
lines changed

1 file changed

+18
-7
lines changed

wikipedia2vec/dump_db.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,24 @@ def redirects(self) -> Iterator[Tuple[str, str]]:
104104
for key, value in iter(cur):
105105
yield (key.decode("utf-8"), value.decode("utf-8"))
106106

107-
def resolve_redirect(self, title: str) -> str:
108-
with self._env.begin(db=self._redirect_db) as txn:
109-
value = txn.get(title.encode("utf-8"))
110-
if value:
111-
return value.decode("utf-8")
112-
else:
113-
return title
107+
def resolve_redirect(self, title: str, max_steps: int = 10) -> str:
    """Follow the redirect chain starting at *title* and return its end.

    Each hop looks up the current title (UTF-8 encoded) in the redirect
    database; the chain ends at the first title with no redirect entry.

    Args:
        title: Title to resolve.
        max_steps: Maximum number of redirect lookups to perform.

    Returns:
        The title reached at the end of the chain. If the chain loops back
        to a title already seen, a warning is logged and the original
        *title* is returned unchanged. If *max_steps* lookups are used up
        first, a warning is logged and the last title reached is returned.
    """
    seen = {title}
    cur_title = title

    # Use one read transaction for the whole chain so every hop sees the
    # same snapshot and we do not pay transaction setup per hop.
    with self._env.begin(db=self._redirect_db) as txn:
        for _ in range(max_steps):
            value = txn.get(cur_title.encode("utf-8"))
            if not value:
                # No redirect entry: cur_title is the final target.
                return cur_title

            cur_title = value.decode("utf-8")
            if cur_title in seen:
                logger.warning("Detected redirect loop: %s", title)
                return title
            seen.add(cur_title)

    logger.warning("Max steps (%s) exceeded when resolving redirect: %s", max_steps, title)
    return cur_title
114125

115126
def is_redirect(self, title: str) -> bool:
116127
with self._env.begin(db=self._redirect_db) as txn:

0 commit comments

Comments
 (0)