11import os
22import hashlib
3+ from datetime import datetime , timedelta
4+
35import pandas as pd
4- from typing import Dict , List
6+ from typing import Dict , List , Optional
7+
8+ from utils .config import DATA_CACHE
9+ from utils .utils import period_to_timedelta
510
611
def _make_cache_key(*args, **kwargs) -> str:
    """
    Build a deterministic cache key from the request parameters.

    Args:
        *args: Positional key components (for example ticker, start date,
            end date, interval, source). Each one is converted with ``str()``,
            so non-string values such as ``None`` are accepted.
        **kwargs: Optional named key components. They are appended as
            ``name=value`` in sorted-name order, so the key does not depend
            on the order in which they were passed.

    Returns:
        str: The hexadecimal MD5 digest of the underscore-joined components.
    """
    # str.join raises TypeError on non-string items (e.g. a None start date),
    # so every component is stringified first.
    parts = [str(arg) for arg in args]
    # Keyword components take part in the key too; otherwise two calls that
    # differ only in a keyword argument would map to the same key.
    parts.extend(f"{name}={kwargs[name]}" for name in sorted(kwargs))
    key = "_".join(parts)
    return hashlib.md5(key.encode()).hexdigest()
1015
1116
def _fetch_data(ticker: str, start_date: str, end_date: str, interval: str, source: str) -> pd.DataFrame:
    """
    Dispatch a price-data request to the fetcher for the chosen data source.

    Args:
        ticker (str): The ticker of the security.
        start_date (str): The start date of the data range.
        end_date (str): The end date of the data range.
        interval (str): The data interval, forwarded unchanged to the fetcher.
        source (str): The data source: "yahoo", "alpaca", or "polygon".

    Returns:
        pd.DataFrame: Whatever the selected fetcher returns.

    Raises:
        ValueError: If ``source`` is not one of the supported sources.
    """
    # Each fetcher module is imported inside its own branch, so only the
    # module for the requested source is imported.
    if source == "yahoo":
        from data_ingestion.yahoo_fetcher import fetch_yahoo_data
        return fetch_yahoo_data(ticker, start_date, end_date, interval)
    elif source == "alpaca":
        from data_ingestion.alpaca_fetcher import fetch_alpaca_data
        return fetch_alpaca_data(ticker, start_date, end_date, interval)
    elif source == "polygon":
        from data_ingestion.polygon_fetcher import fetch_polygon_data
        return fetch_polygon_data(ticker, start_date, end_date, interval)
    else:
        raise ValueError(f"Unsupported data source: {source}. Supported sources are 'yahoo', 'alpaca', and 'polygon'.")
2429
2530
26- def load_price_data (ticker : str , start_date : str , end_date : str ,
31+ def load_price_data (ticker : str , end_date : str ,
32+ start_date : Optional [str ] = None ,
33+ interval : str = '1d' ,
2734 use_cache : bool = True ,
2835 force_refresh : bool = False ,
2936 source : str = "yahoo" ) -> pd .DataFrame :
3037 """
3138 Load historical OHLCV data for a single ticker from the specified data source.
3239
3340 Args:
34- ticker (str): The ticker ticker of the security.
35- start_date (str): The start date of the data range.
41+ ticker (str): The ticker of the security.
3642 end_date (str): The end date of the data range.
43+ start_date (str, optional): The start date of the data range.
44+ interval (str): The data interval.
45+ period (int): The data period.
3746 use_cache (bool, optional): Whether to use cached data. Defaults to True.
3847 force_refresh (bool, optional): Whether to force a refresh of the data. Defaults to False.
3948 source (str, optional): The data source to use. Defaults to "yahoo".
4049
4150 Returns:
4251 pd.DataFrame: A pandas DataFrame containing the historical OHLCV data.
4352 """
44- os .makedirs ("./data_cache" , exist_ok = True )
45- cache_key = _make_cache_key (ticker , start_date , end_date , source )
46- cache_path = os .path .join ("./data_cache" , f"{ cache_key } .parquet" )
53+ os .makedirs (DATA_CACHE , exist_ok = True )
54+ cache_key = _make_cache_key (ticker , start_date , end_date , interval , source )
55+ cache_path = os .path .join (DATA_CACHE , f"{ cache_key } .parquet" )
4756
4857 if use_cache and os .path .exists (cache_path ) and not force_refresh :
4958 try :
5059 return pd .read_parquet (cache_path )
5160 except Exception :
5261 print (f"⚠️ Cache corrupted at { cache_path } , refetching..." )
5362
54- df = _fetch_data (ticker , start_date , end_date , source )
63+ df = _fetch_data (ticker , start_date , end_date , interval , source )
64+
65+ try :
66+ df .index = df .index .tz_localize ("UTC" )
67+ except :
68+ df .index = df .index .tz_convert ("UTC" )
5569
5670 if df .empty or "Close" not in df .columns :
5771 raise ValueError (f"No data returned for { ticker } from { start_date } to { end_date } " )
@@ -69,18 +83,38 @@ def __init__(self, use_cache=True, force_refresh=False, source="yahoo"):
6983 self .force_refresh = force_refresh
7084 self .source = source
7185
72- def get_data (self , tickers : List [str ], start_date : str , end_date : str ) -> Dict [str , pd .DataFrame ] | pd .DataFrame :
86+ def get_data (
87+ self , tickers : List [str ], end_date : str , start_date : Optional [str ] = None ,
88+ interval : Optional [str ] = '1d' , period : Optional [int ] = '5y' ,
89+ ) -> Dict [str , pd .DataFrame ] | pd .DataFrame :
7390 """
7491 Return a dictionary of {ticker: DataFrame} for all requested tickers.
7592
7693 Args:
7794 tickers (List[str]): A list of ticker symbols.
7895 start_date (str): The start date of the data range.
7996 end_date (str): The end date of the data range.
97+ interval (str): The data interval.
98+ period (int): The data period.
8099
81100 Returns:
82101 Dict[str, pd.DataFrame]: A dictionary containing the historical OHLCV data for each ticker.
83102 """
103+ start = pd .to_datetime (start_date ) if start_date else datetime .today ().date ()
104+ end = pd .to_datetime (end_date ) if end_date else datetime .today ().date ()
105+
106+ # IF start after end
107+ # OR, if interval in minutes, but period > 60
108+ if start >= end :
109+ period_int = period_to_timedelta (period )
110+ start -= period_int
111+ start_date = start .strftime ('%Y-%m-%d' )
112+
113+ elif interval .endswith ("m" ) and (end - start ).days >= 60 : # Yahoo-finance limitation
114+ start = end - timedelta (days = 59 )
115+ start_date = start .strftime ('%Y-%m-%d' )
116+
117+ data = {}
84118 # TODO: Convert Dict structure to a single Multi-indexed dataframe?
85119 # data = load_price_data(
86120 # tickers,
@@ -90,12 +124,12 @@ def get_data(self, tickers: List[str], start_date: str, end_date: str) -> Dict[s
90124 # force_refresh=self.force_refresh,
91125 # source=self.source
92126 # )
93- data = {}
94127 for ticker in tickers :
95128 data [ticker ] = load_price_data (
96129 ticker ,
97- start_date ,
98- end_date ,
130+ start_date = start_date ,
131+ end_date = end_date ,
132+ interval = interval ,
99133 use_cache = self .use_cache ,
100134 force_refresh = self .force_refresh ,
101135 source = self .source
0 commit comments