-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathreader.py
More file actions
199 lines (146 loc) · 6.57 KB
/
reader.py
File metadata and controls
199 lines (146 loc) · 6.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
__author__="Thomas Mayer"
__date__="2014-04-22"
import collections
import settings
# Optional third-party imports (get_matrix needs coo_matrix); any failure while importing them is silently ignored.
try:
from scipy.sparse import lil_matrix, csc_matrix, coo_matrix
from scipy.io import mmread, mmwrite
import scipy.special
import numpy as np
import pandas as pd
except:
pass
import re
import os
class ParText():
    """Reads a file in the BibleText format and provides several ways to access the text.

    Every non-comment line is expected to consist of a numeric verse ID, the
    separator and the verse text. The verse text is stripped of punctuation,
    lower-cased and split on whitespace, so each verse is stored as a list of
    wordforms.

    Parameters:
    =============
    filename: name of the file to be read (first three letters must indicate ISO 639-3 code);
        a bare three-letter code is resolved to the file in settings._data_dir
        whose name starts with that code (KeyError if there is none)
    commentmarker: character that marks a comment line
    sep: character that separates Bible IDs from the actual text
    enc: encoding of the file (default: UTF-8)
    portions: container of book numbers (the first two digits of the verse ID);
        only verses whose book number is in this container are kept
    """

    # Punctuation marks and symbols removed from every line before tokenizing.
    # TODO: find a better method to remove all non-letters!
    _PUNCTUATION = re.compile(r"[“”‘’`´“”‘’`´‚<>.;,:?¿‹›!()\[\]—\"„§$%&\/\=_{}]")

    # Number of columns of the matrix built by get_matrix(); verse IDs are used
    # directly as column indices, so they must be smaller than this value.
    _MATRIX_COLUMNS = 99999999

    def __init__(self,filename,commentmarker="#",sep="\t",enc="utf-8",portions=range(0,67)):
        self.iso = filename[:3]
        self.filename = filename
        # get shortcuts: a bare ISO code stands for the matching file in the data directory
        if re.match('^[a-zA-Z]{3}$',filename):
            bible_files = [f for f in os.listdir(settings._data_dir) if not f.startswith('.')]
            iso_by_bible = {f[:3]:f for f in bible_files}
            filename = iso_by_bible[filename]
        # read the file; the context manager makes sure the handle is closed again
        with open(settings._data_dir + filename,'r',encoding=enc) as fh:
            lines = fh.readlines()
        # collect all verses
        self.verses = self._parse_lines(lines,commentmarker,sep,portions)
        self.versedict = {v[0]:v[1] for v in self.verses}

    @classmethod
    def _parse_lines(cls,lines,commentmarker="#",sep="\t",portions=range(0,67)):
        """Turns raw lines into a list of (verse ID, list of wordforms) tuples.

        Comment lines and blank lines are skipped. A line that has a verse ID
        but no separator is kept as a verse without any words. A line that
        does not start with a number raises ValueError.
        """
        verses = []
        for raw in lines:
            # test for comments on the raw line so that a comment marker that is
            # itself a punctuation mark is still recognized
            if raw.strip().startswith(commentmarker):
                continue
            # strip punctuation line by line; joining all lines on a marker
            # string would break on lines that contain that marker themselves
            line = cls._PUNCTUATION.sub('',raw)
            stripped = line.strip()
            if not stripped:
                continue
            # the first two digits of the verse ID are the book number
            if int(stripped[:2]) not in portions:
                continue
            items = line.split(sep,1)
            text = items[1] if len(items) > 1 else ''
            verses.append((int(items[0].strip()),text.strip().lower().split()))
        return verses

    def __getitem__(self,id):
        """Returns the text of the verse given by the verse id (as a list of wordforms).
        """
        return self.versedict[id]

    def __len__(self):
        """Returns the length of the parallel text in number of verses.
        """
        return len(self.verses)

    def get_verses(self,format="dict"):
        """Returns the verses of the parallel text

        Parameters:
        ==============
        format: either as a list of tuples [format='tuple'] (40001001,["bla",...,"bla"])
            or as a dictionary [format='dict'] {40001001: ["bla",...,"bla"]};
            any value other than 'tuple' yields the dictionary
        """
        if format == "tuple":
            return self.verses
        return dict(self.verses)

    def get_verses_strings(self):
        """
        Returns a list of tuples with verse ID and verse text as string (ID,text).
        """
        return [(verse_id,' '.join(words)) for verse_id,words in self.verses]

    def get_lexicon(self):
        """Returns the wordforms of the text together with the information in which verses they
        occur (a sorted list of verse IDs without duplicates for each wordform).
        """
        lex = collections.defaultdict(set)
        for verse_id,words in self.verses:
            for word in words:
                lex[word].add(verse_id)
        # replace the sets by sorted lists; only values change, so iterating is safe
        for word in lex:
            lex[word] = sorted(lex[word])
        return lex

    def get_wordforms(self,format="types"):
        """Returns the wordforms of the parallel text.

        The frequency dictionary is also stored in self.wordforms.

        Parameters:
        ===========
        format: either a dict of types [format='types'] (with frequency as value)
            or the sorted list of distinct wordforms [format='tokens']
        """
        # count all wordforms
        self.wordforms = collections.defaultdict(int)
        for _,words in self.verses:
            for word in words:
                if word.strip() != '':
                    self.wordforms[word] += 1
        if format == "tokens":
            return sorted(self.wordforms)
        return self.wordforms

    def wordforms_verses_count(self):
        """Returns a two-dimensional dictionary of wordforms and verses and how often the
        wordform occurs in the verse."""
        wordforms_verses_counter = collections.defaultdict(lambda: collections.defaultdict(int))
        for wordform,verse_ids in self.wordforms_verses().items():
            for verse_id in verse_ids:
                wordforms_verses_counter[wordform][verse_id] += 1
        return wordforms_verses_counter

    def wordforms_verses(self):
        """Returns a dictionary of wordforms in which verses they occur (one list entry
        per occurrence, so a verse ID is repeated if the wordform occurs several times).
        """
        wordforms_by_verses = collections.defaultdict(list)
        for verse_id,words in self.verses:
            for word in words:
                wordforms_by_verses[word.lower()].append(verse_id)
        return wordforms_by_verses

    def substrings_wordforms(self):
        """Returns a dictionary of substrings in which wordforms they occur.
        """
        substrings_by_wordforms = collections.defaultdict(set)
        for word in self.get_wordforms():
            word = word.lower()
            # every non-empty slice word[start:end] is a substring of the wordform
            for start in range(len(word)+1):
                for end in range(start+1,len(word)+1):
                    substrings_by_wordforms[word[start:end]].add(word)
        return substrings_by_wordforms

    def get_verseids(self):
        """Returns the verse Ids for this parallel text (sorted)."""
        return sorted(v[0] for v in self.verses)

    def get_matrix(self):
        """Returns a sparse matrix with wordform numbers as row indices and verse IDs as
        column indices, together with the sorted list of wordforms and the dictionary
        that maps each wordform to its row number.

        Every occurrence of a word in a verse adds one entry with value 1, so the
        count of a word in a verse is obtained once duplicate entries are summed
        (e.g. by converting the matrix to CSR/CSC format).
        Needs coo_matrix from scipy.sparse.
        """
        wordforms = self.get_wordforms(format="tokens")
        wordforms_by_number = {w: i for i,w in enumerate(wordforms)}
        rowdata = list()
        coldata = list()
        data = list()
        for verse_id,words in self.verses:
            for word in words:
                rowdata.append(wordforms_by_number[word])
                coldata.append(verse_id)
                data.append(1)
        sparse = coo_matrix((data,(rowdata,coldata)),dtype="int16",
                            shape=(len(wordforms),self._MATRIX_COLUMNS))
        return sparse,wordforms,wordforms_by_number
# Script entry point: running this module directly does nothing.
if __name__ == "__main__":
    pass