File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -160,7 +160,8 @@ define([
160160 'printCommand.py' ,
161161 'fileNaviCommand.py' ,
162162 'pandasCommand.py' ,
163- 'variableCommand.py'
163+ 'variableCommand.py' ,
164+ 'userCommand.py'
164165 ] ;
165166 let promiseList = [ ] ;
166167 libraryList . forEach ( libName => {
Original file line number Diff line number Diff line change @@ -30,37 +30,6 @@ import fitz
3030import nltk
3131nltk.download('punkt')` ;
3232
33- const PDF_FUNC = `def vp_pdf_get_sentence(fname_lst):
34- '''
35- Get sentence from pdf file by PyMuPDF
36- '''
37- df = pd.DataFrame()
38- for fname in fname_lst:
39- if fname.split('.')[-1] != 'pdf': continue
40- try:
41- doc = fitz.open(fname)
42- sentence_lst = []
43- for page in doc:
44- block_lst = page.get_text('blocks')
45-
46- text_lst = [block[4] for block in block_lst if block[6] == 0]
47- text = '\\n'.join(text_lst)
48-
49- sentence_lst.extend([sentence for sentence in nltk.sent_tokenize(text)])
50-
51- doc.close()
52- except Exception as e:
53- print(e)
54- continue
55-
56- df_doc = pd.DataFrame({
57- 'fname': fname.split('/')[-1],
58- 'sentence': sentence_lst
59- })
60- df = pd.concat([df,df_doc])
61-
62- return df.reset_index().drop('index', axis=1)` ;
63-
6433 const PDF_CMD = 'df = vp_pdf_get_sentence(pdf_lst)\ndf'
6534 /**
6635 * PDF
@@ -98,7 +67,6 @@ nltk.download('punkt')`;
9867 // click import
9968 $ ( this . wrapSelector ( '.vp-pdf-import-btn' ) ) . on ( 'click' , function ( ) {
10069 com_interface . insertCell ( 'code' , PDF_IMPORT ) ;
101- com_interface . insertCell ( 'code' , PDF_FUNC ) ;
10270 } ) ;
10371
10472 // click file navigation button
Original file line number Diff line number Diff line change 1+ import pandas as pd
2+ import numpy as np
3+ import fitz
4+ import nltk
5+ nltk .download ('punkt' )
6+
def vp_pdf_get_sentence(fname_lst):
    '''
    Get sentences from pdf files by PyMuPDF.

    Each path in fname_lst whose extension is 'pdf' (compared
    case-insensitively) is opened with fitz, the text blocks of every page
    are joined with newline characters, and the page text is split into
    sentences with nltk.sent_tokenize.

    Parameters
    ----------
    fname_lst : iterable of str
        Paths of the files to read. Entries without a 'pdf' extension are
        skipped silently.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with the columns 'fname' (the part of the path
        after the last '/') and 'sentence', indexed 0..n-1. When no file
        yields a frame, an empty DataFrame with these two columns is
        returned.

    Notes
    -----
    A file that cannot be opened or parsed is reported with print() and
    skipped, so one broken PDF does not abort the whole batch.
    '''
    frames = []
    for fname in fname_lst:
        # Compare the extension case-insensitively so 'REPORT.PDF' is accepted.
        if fname.split('.')[-1].lower() != 'pdf':
            continue

        sentence_lst = []
        try:
            doc = fitz.open(fname)
            try:
                for page in doc:
                    block_lst = page.get_text('blocks')

                    # Per PyMuPDF's "blocks" tuple layout, item 4 is the block
                    # text and item 6 the block type (0 = text, 1 = image).
                    text_lst = [block[4] for block in block_lst if block[6] == 0]
                    # Join with a real newline. The previous '\\n' literal was
                    # carried over from a JavaScript template string and, in a
                    # Python file, inserted a backslash followed by 'n'.
                    text = '\n'.join(text_lst)

                    sentence_lst.extend(nltk.sent_tokenize(text))
            finally:
                # Release the document even when a page fails to parse.
                doc.close()
        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            print(e)
            continue

        frames.append(pd.DataFrame({
            'fname': fname.split('/')[-1],
            'sentence': sentence_lst
        }))

    if not frames:
        return pd.DataFrame(columns=['fname', 'sentence'])

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames, ignore_index=True)
37+
def vp_drop_outlier(df, col, weight=1.5):
    '''
    Return a copy of df without the rows that are IQR outliers in col.

    A value is an outlier when it lies below Q1 - weight * IQR or above
    Q3 + weight * IQR, where Q1 and Q3 are the 25th and 75th percentiles of
    the column and IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    col : label
        Column whose values are checked for outliers.
    weight : float, default 1.5
        Multiplier applied to the IQR to build the lower and upper fences.

    Returns
    -------
    pandas.DataFrame
        Copy of df containing only the non-outlier rows, with their original
        index labels. Rows whose value in col is NaN are kept.
    '''
    sr = df[col]

    # Series.quantile ignores NaN. np.percentile returns NaN as soon as the
    # column holds a single missing value, which turned both fences into NaN
    # and silently disabled the filter.
    q25 = sr.quantile(0.25)
    q75 = sr.quantile(0.75)

    iqr_w = (q75 - q25) * weight

    val_l = q25 - iqr_w
    val_h = q75 + iqr_w

    # Filter by position rather than by index label: dropping labels would
    # also remove non-outlier rows that share a label with an outlier when
    # the index is not unique.
    is_outlier = ((sr < val_l) | (sr > val_h)).to_numpy()

    return df.loc[~is_outlier].copy()
You can’t perform that action at this time.
0 commit comments