-
Notifications
You must be signed in to change notification settings - Fork 12
Open
Description
env
python 3.11
python-calamine==0.3.1
code
def get_table_header(self) -> StructureInfo:
_, source = self._get_extension_and_source()
import python_calamine as calamine
workbook = calamine.load_workbook(source)
sheets = workbook.sheet_names
column_info: dict[str, ColumnList] = {}
for sheet_name in sheets:
# 读取工作表中的数据
sheet = workbook.get_sheet_by_name(sheet_name)
data = sheet.iter_rows()
header = next(data)
df = pd.DataFrame(data, columns=header)
# 获取列信息(表头)
column_info[sheet_name] = self._get_table_header(df)
del sheet, df
break # 如果只需要处理一个工作表,使用break退出循环
workbook.close()
del workbookI has tried many way to make memory not grow up.
like 1) with load_workbook() as workbook: 2) sheet.to_python()
memory info:
- ps auxf
# before read a 20MB excel
root 3752960 78.0 3.5 1705596 718732 pts/0 Sl+ 19:52 0:16 \_ /aipaas/.venv/bin/python /aipaas/.venv/bin/uvicorn --app-dir /aipaas/src aipaas.main:app --host 0.0.0.0
# after read a 20MB excel
root 3752960 1.7 4.2 1914828 853372 pts/0 Sl+ 19:52 0:22 \_ /aipaas/.venv/bin/python /aipaas/.venv/bin/uvicorn --app-dir /aipaas/src aipaas.main:app --host 0.0.0.0
and didnot go down after a long time like one hour
- memory_profiler
=============================================================
33 725.4 MiB 725.4 MiB 1 @profile
34 def get_table_header(self) -> StructureInfo:
35 725.4 MiB 0.0 MiB 1 _, source = self._get_extension_and_source()
36
37 # with pd.ExcelFile(source, engine="calamine") as workbook:
38 # sheets = workbook.sheet_names
39 #
40 # column_info: dict[str, ColumnList] = {}
41 # for sheet_name in sheets:
42 # df = workbook.parse(sheet_name=sheet_name, header=None, dtype=str, na_filter=False, **self.excel_args)
43 # column_info[sheet_name] = self._get_table_header(df)
44 # del df
45 # break # 仅返回第一个sheet
46 725.4 MiB 0.0 MiB 1 import sys
47 725.6 MiB 0.2 MiB 1 import python_calamine as calamine
48 793.0 MiB 67.4 MiB 1 workbook = calamine.load_workbook(source)
49 793.0 MiB 0.0 MiB 1 print(f"workbook size: {sys.getsizeof(workbook)}")
50 793.0 MiB 0.0 MiB 1 sheets = workbook.sheet_names
51
52 793.0 MiB 0.0 MiB 1 column_info: dict[str, ColumnList] = {}
53 793.0 MiB 0.0 MiB 1 for sheet_name in sheets:
54 # 读取工作表中的数据
55 796.8 MiB 3.8 MiB 1 sheet = workbook.get_sheet_by_name(sheet_name)
56 796.8 MiB 0.0 MiB 1 print(f"sheet size: {sys.getsizeof(sheet)}")
57 796.8 MiB 0.0 MiB 1 data = sheet.iter_rows()
58 796.8 MiB 0.0 MiB 1 header = next(data)
59 860.0 MiB 63.2 MiB 1 df = pd.DataFrame(data, columns=header)
60 860.0 MiB 0.0 MiB 1 print(f"df size: {sys.getsizeof(df)}")
61 # 获取列信息(表头)
62 860.0 MiB 0.0 MiB 1 column_info[sheet_name] = self._get_table_header(df)
63 853.2 MiB -6.8 MiB 1 del sheet, df
64 853.2 MiB 0.0 MiB 1 break # 如果只需要处理一个工作表,使用break退出循环
65 853.2 MiB 0.0 MiB 1 workbook.close()
66 853.2 MiB 0.0 MiB 1 del workbook
67
68 853.2 MiB 0.0 MiB 1 return StructureInfo(sheets=sheets, column_info=column_info)
ps. source is io.BytesIO
dimastbk and pablo-voltaiq
Metadata
Metadata
Assignees
Labels
No labels