File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -25,7 +25,8 @@ define ([
2525
2626 const PDF_IMPORT = `import pandas as pd
2727import fitz
28- from nltk.tokenize import sent_tokenize` ;
28+ import nltk
29+ nltk.download('punkt')` ;
2930
3031 const PDF_FUNC = `def vp_pdf_get_sentence(fname_lst):
3132 '''
@@ -43,14 +44,15 @@ from nltk.tokenize import sent_tokenize`;
4344 text_lst = [block[4] for block in block_lst if block[6] == 0]
4445 text = '\\n'.join(text_lst)
4546
46- sentence_lst.extend([sentence for sentence in sent_tokenize(text)])
47+ sentence_lst.extend([sentence for sentence in nltk. sent_tokenize(text)])
4748
4849 doc.close()
49- except:
50+ except Exception as e:
51+ print(e)
5052 continue
5153
5254 df_doc = pd.DataFrame({
53- 'fname': fname,
55+ 'fname': fname.split('/')[-1] ,
5456 'sentence': sentence_lst
5557 })
5658 df = pd.concat([df,df_doc])
You can’t perform that action at this time.
0 commit comments