Skip to content

Commit adb7df0

Browse files
committed
feat: complete cross-platform support for all the supported browsers + can now explore multiple default profiles for Chrome and Edge (but not Profile 2 and more, only Profile 1) + speed optimizations + more robust pandas dataframe construction and indexing + bump v0.3.4
Signed-off-by: Stephen L. <LRQ3000@gmail.com>
1 parent eec99e4 commit adb7df0

File tree

2 files changed

+96
-65
lines changed

2 files changed

+96
-65
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project] # beware if using setuptools: setup.py still gets executed, and even if pyproject.toml fields take precedence, if there is any code error in setup.py, building will fail!
99
name = "webactogram" # renamed from online-actogram according to PEP 423 https://peps.python.org/pep-0423/#pick-meaningful-names
10-
version = "0.3.3" # see PEP 440 https://peps.python.org/pep-0440/#pre-releases and https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
10+
version = "0.3.4" # see PEP 440 https://peps.python.org/pep-0440/#pre-releases and https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
1111
description = "Actogram from browsers history, may help to screen sleep-wake patterns & disorders!"
1212
authors = [
1313
{name = "Barrett F. Davis", email = "barrettfdavis@gmail.com"},

src/webactogram/webactogram.py

Lines changed: 95 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import configparser
4141
import glob
4242
import os
43+
import random
4344
import shlex
4445
import sys
4546
import sqlite3
@@ -51,6 +52,8 @@
5152
from itertools import groupby
5253
# Typing
5354
from collections.abc import Sequence
55+
# Path
56+
from pathlib import Path
5457

5558
# Scientific stack
5659
import numpy as np
@@ -127,6 +130,7 @@ class ImportData:
127130
def __init__(self, act):
128131
super().__init__()
129132
self.act = act
133+
self.history_loc_dict_temp = [] # temporary list of [browser, filepath] pairs recording the filepaths of the temporary history files and which browser they relate to (for SQLite queries) -- we copy history files to a temporary folder to avoid modifying the original ones
130134

131135
self.__main__()
132136

@@ -157,51 +161,75 @@ def lookup_history_filepaths(self):
157161
""" check which OS user is running script from, then
158162
check typical file paths for popular browser history files """
159163

160-
home = os.path.expanduser("~")
164+
#home = os.path.expanduser("~")
165+
home = str(Path.home())
166+
history_filepaths = {} # dictionary to store filepaths for each browser. Structure is: {browser: [filepath1, filepath2, ...]} because there can be multiple filepaths for the browser history, multiple profiles per browser, and sometimes the default profile is not named Default (eg for Chrome).
161167

162168
if sys.platform == "darwin": # Darwin == OSX
163-
safari_src = os.path.join(home, 'Library/Safari/History.db')
164-
chrome_src = os.path.join(home, 'Library/Application Support/Google/Chrome/Default/History')
165-
firefox_src = os.path.join(self.find_firefox_profile(home), 'places.sqlite')
166-
edge_src = None # TODO
169+
history_filepaths['safari'] = [os.path.join(home, 'Library/Safari/History.db')]
170+
history_filepaths['chrome'] = [os.path.join(home, 'Library/Application Support/Google/Chrome/Default/History'),
171+
os.path.join(home, 'Library/Application Support/Google/Chrome/Profile 1/History'),
172+
os.path.join(home, 'Library/Application Support/Google/Chrome/Guest/History')
173+
]
174+
history_filepaths['firefox'] = [os.path.join(self.find_firefox_profile(home), 'places.sqlite')]
175+
history_filepaths['edge'] = [os.path.join(home, 'Library/Application Support/Microsoft Edge/Default/History'),
176+
os.path.join(home, 'Library/Application Support/Microsoft Edge/Profile 1/History'),
177+
os.path.join(home, 'Library/Application Support/Microsoft Edge/Guest/History')
178+
]
167179

168180
elif sys.platform == "win32":
169-
safari_src = None
170-
chrome_src = home + '/AppData/Local/Google/Chrome/User Data/Default/History'
171-
firefox_src = os.path.join(self.find_firefox_profile(home), 'places.sqlite')
172-
edge_src = home + '/AppData/Local/Microsoft/Edge/User Data/Default/History'
181+
# Note: when using os.path.join(), make sure there is no leading '/', otherwise it will treat it as an absolute path and forget the home directory
182+
history_filepaths['safari'] = [os.path.join(home, 'AppData/Local/Safari/History.db')]
183+
history_filepaths['chrome'] = [os.path.join(home, 'AppData/Local/Google/Chrome/User Data/Default/History'),
184+
os.path.join(home, 'AppData/Local/Google/Chrome/User Data/Profile 1/History'),
185+
os.path.join(home, 'AppData/Local/Google/Chrome/User Data/Guest/History')
186+
]
187+
history_filepaths['firefox'] = [os.path.join(self.find_firefox_profile(home), 'places.sqlite')]
188+
history_filepaths['edge'] = [os.path.join(home, 'AppData/Local/Microsoft/Edge/User Data/Default/History'),
189+
os.path.join(home, 'AppData/Local/Microsoft/Edge/User Data/Profile 1/History'),
190+
os.path.join(home, 'AppData/Local/Microsoft/Edge/User Data/Guest/History')
191+
]
173192

174193
elif sys.platform == "linux":
175-
safari_src = None
176-
chrome_src = None # TODO
177-
firefox_src = os.path.join(self.find_firefox_profile(home), 'places.sqlite')
178-
edge_src = None # TODO
194+
history_filepaths['safari'] = [os.path.join(home, '.config/safari/History.db')]
195+
history_filepaths['chrome'] = [os.path.join(home, '.config/google-chrome/Default/History'),
196+
os.path.join(home, '.config/google-chrome/Profile 1/History'),
197+
os.path.join(home, '.config/google-chrome/Guest/History')
198+
]
199+
history_filepaths['firefox'] = [os.path.join(self.find_firefox_profile(home), 'places.sqlite')]
200+
history_filepaths['edge'] = [os.path.join(home, '.config/microsoft-edge/Default/History'),
201+
os.path.join(home, '.config/microsoft-edge/Profile 1/History'),
202+
os.path.join(home, '.config/microsoft-edge/Guest/History')
203+
]
179204

180205
else:
181206
print('Sorry, having trouble with your operating system.')
182207
sys.exit()
183208

184-
self.history_loc_dict = {'safari': [safari_src, 'History.db'],
185-
'chrome': [chrome_src, 'History'],
186-
'firefox': [firefox_src, 'places.sqlite'],
187-
'edge': [edge_src, 'History']
188-
}
209+
self.history_loc_dict = history_filepaths
189210

190211
def copy_history_to_temp_folder(self):
191212
""" Iterate through each file referenced in the history_loc_dict
192213
and copy to some temporary folder. This avoids direclty operating
193214
on the user's browsers' history files. """
194-
for key, value in self.history_loc_dict.items():
195-
src, fname = value
215+
for browser, pathslist in self.history_loc_dict.items():
216+
for path in pathslist:
217+
if path is not None and os.path.exists(path) and os.path.isfile(path):
218+
# If the file exists, copy it to a temporary folder
219+
self.copy_history_func(browser, path)
196220

197-
if src is not None:
198-
self.copy_history_func(src, fname)
199-
200-
201-
def copy_history_func(self, src, fname, dst_folder='temp_history'):
221+
def copy_history_func(self, browser, src, dst_folder='temp_history'):
202222
""" function to copy file at given file location to temporary folder"""
203223
os.makedirs(dst_folder, exist_ok=True)
224+
fname = Path(src).name # get the filename from the path
204225
dst = os.path.join(dst_folder, fname)
226+
# Test if destination file already exists
227+
if os.path.exists(dst):
228+
# If it already exists (eg, multiple profiles for one browser), change destination folder dst to append a random number to the filename to avoid collision
229+
dst = os.path.join(dst_folder, fname + str(hex(random.getrandbits(65))[2:-1]))
230+
231+
# Since the output can be different from input, we need to create a new dict to map and remember what are the browsers (to know which SQLite commands to send)
232+
self.history_loc_dict_temp.append([browser, dst])
205233

206234
try:
207235
copy(src, dst)
@@ -220,34 +248,35 @@ def copy_history_func(self, src, fname, dst_folder='temp_history'):
220248
def import_history_to_working_memory(self):
221249
""" Imports all the files in the temporary folder into working
222250
memory. Each browser's particular history file format is
223-
standardized before concatenating to an overarching df"""
224-
for key, value in self.history_loc_dict.items():
225-
src, fname = value
226-
227-
if src is not None:
228-
if not os.path.isfile(src):
229-
continue
230-
231-
if key == 'safari':
232-
command_str = 'SELECT datetime(visit_time+978307200, "unixepoch",\
233-
"localtime") FROM history_visits ORDER BY visit_time DESC;'
234-
235-
elif key == 'chrome':
236-
command_str = "SELECT datetime(last_visit_time/1000000-11644473600,\
237-
'unixepoch','localtime'), url FROM urls ORDER BY last_visit_time DESC;"
238-
239-
elif key == 'firefox':
240-
command_str = 'SELECT datetime(visit_date/1000000,\
241-
"unixepoch", "localtime") FROM moz_historyvisits ORDER BY visit_date ASC;'
242-
pass
243-
244-
elif key == 'edge':
245-
command_str = "SELECT datetime(last_visit_time/1000000-11644473600,\
246-
'unixepoch','localtime'), url FROM urls ORDER BY last_visit_time DESC;"
247-
248-
temp_src = os.path.join('temp_history', fname)
249-
df = self._import_history_func(temp_src, command_str)
250-
self.act.df = pd.concat([self.act.df, df])
251+
standardized before concatenating to an overarching df.
252+
This effectively merges all histories from various browsers and profiles."""
253+
254+
# All the sql commands to extract the history data from the different browsers SQLite databases
255+
sql_commands = {
256+
'safari':
257+
'SELECT datetime(visit_time+978307200, "unixepoch",\
258+
"localtime") FROM history_visits ORDER BY visit_time DESC;',
259+
'chrome':
260+
"SELECT datetime(last_visit_time/1000000-11644473600,\
261+
'unixepoch','localtime') FROM urls ORDER BY last_visit_time DESC;",
262+
'firefox':
263+
'SELECT datetime(visit_date/1000000,\
264+
"unixepoch", "localtime") FROM moz_historyvisits ORDER BY visit_date ASC;',
265+
'edge':
266+
"SELECT datetime(last_visit_time/1000000-11644473600,\
267+
'unixepoch','localtime') FROM urls ORDER BY last_visit_time DESC;"
268+
}
269+
270+
df_list = [] # list of dataframes to concatenate outside of the loop (faster than concatenating inside the loop, otherwise memory is reallocated at each iteration so complexity is O(N^2) quadratic)
271+
272+
# For each browsers' history file, import the data into a pandas dataframe and add into a list
273+
for (browser, path) in self.history_loc_dict_temp:
274+
if path is not None and os.path.exists(path) and os.path.isfile(path):
275+
# Only import the file if it exists; missing files are skipped
276+
df = self._import_history_func(path, sql_commands[browser])
277+
df_list.append(df) # add the dataframe to the list of dataframes, complexity O(1)
278+
# Concatenate all dataframes at once, complexity is then O(N)
279+
self.act.df = pd.concat(df_list)
251280

252281
def delete_temporary_history_folder(self):
253282
""" Delete the temporary folder after files are copied into working
@@ -259,13 +288,13 @@ def delete_temporary_history_folder(self):
259288
def _import_history_func(self, file_name, command_str):
260289
""" Function to open SQL styled history files and convert to a pandas
261290
DataFrame type. SQL objects are closed after copying to Pandas DF. """
262-
cnx = sqlite3.connect(file_name)
263-
df = pd.read_sql_query(command_str, cnx)
264-
cnx.commit()
265-
cnx.close()
291+
cnx = sqlite3.connect(file_name) # connect to the SQLite database
292+
df = pd.read_sql_query(command_str, cnx) # read the SQL query into a pandas dataframe
293+
cnx.commit() # commit changes (this is necessary to close the connection, and is why we copy the history files to a temporary folder beforehand to avoid tampering the originals)
294+
cnx.close() # close the connection
266295

267-
df.rename(inplace=True, columns={df.columns[0]: 'visit_time'})
268-
df = pd.to_datetime(df['visit_time'], errors='coerce').dropna()
296+
df.rename(inplace=True, columns={df.columns[0]: 'visit_time'}) # rename the column to 'visit_time' for consistency
297+
df['visit_time'] = pd.to_datetime(df['visit_time'], errors='coerce').dropna() # drop NaT values (and keep it as a DataFrame, because it will always return a Series since we are manipulating a single column)
269298

270299
return df
271300

@@ -308,7 +337,7 @@ def aggregate_visits_by_freq(self):
308337
rows corresponding to all the time intervals (e.g. 5 min)
309338
in the input dataframe's date range. Output row values are the
310339
number of visits within each time interval. """
311-
visits = pd.to_datetime(self.df.iloc[:, 0])
340+
visits = pd.to_datetime(self.df.loc[:, 'visit_time'])
312341
self.df = pd.DataFrame({'visits': np.ones(len(visits))}, index=visits)
313342
self.df = self.df.resample(self.act.freq).agg({'visits': 'sum'})
314343
self.df = self.df.fillna(0)
@@ -339,7 +368,9 @@ def pre_allocate_binned_df(self):
339368
self.binned_df = bdf
340369

341370
def clip_date_range(self):
342-
first_visit = self.df.ne(0).idxmax()[0]
371+
first_visit = self.df.ne(0) # creates a boolean mask where each element is True if the corresponding element in self.df is not equal to 0, and False otherwise.
372+
first_visit = first_visit.idxmax() # returns the index of the first occurrence of the maximum value in the Series. If the Series is all True/False values, then this will be the index of the first True value.
373+
first_visit = first_visit.iloc[0] # indexing the Series returned by idxmax().
343374
dt_first_visit = dt.combine(first_visit, dt.min.time())
344375
if self.act.start <= dt_first_visit: self.act_start = dt_first_visit
345376

@@ -636,23 +667,23 @@ def subplot_the_timeshare(self, ax, ref_ax):
636667
def plot_subplot_titles(self, ax, fig_ax):
637668
p = self.plot_params
638669

639-
increments =int(60/(self.freq_no/(24)))
670+
steps = int(60/(self.freq_no/(24)))
640671

641672
if self.landscape:
642673
ax.text(1, 1+p['hspace']/2, p['labels'][0], ha='right')
643674
ax.text(1, p['hspace'], p['labels'][1], ha='right')
644675

645676
s = ("Approximate sleep-wake periods, generated from time stamped "
646677
"internet browser searches\nbetween {:%d-%b-%Y} and {:%d-%b-%Y}. "
647-
"Increments of {} minutes.".format(self.act.dd[0], self.act.dd[-1], increments))
678+
"Window steps of {} minutes.".format(self.act.dd[0], self.act.dd[-1], steps))
648679

649680
else:
650681
ax.text(1, 1-p['hspace'], p['labels'][0], ha='right')
651682
ax.text(1, p['hspace']/2, p['labels'][1], ha='right')
652683

653684
s = ("Approximate sleep-wake periods, generated from time stamped "
654685
"internet browser searches between {:%d-%b-%Y} and {:%d-%b-%Y}. "
655-
"Increments of {} minutes.".format(self.act.dd[0], self.act.dd[-1], increments))
686+
"Window steps of {} minutes.".format(self.act.dd[0], self.act.dd[-1], steps))
656687

657688
fig_ax.text(x=0, y=1.1, s='Double-Plotted Online Actogram',
658689
ha='left', va='bottom', fontweight='bold', wrap=True)

0 commit comments

Comments
 (0)