forked from buriy/python-readability
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathencoding.py
More file actions
27 lines (23 loc) · 657 Bytes
/
Copy pathencoding.py
File metadata and controls
27 lines (23 loc) · 657 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import logging
import re
import chardet
# Module-level logger; getLogger() called without a name returns the root logger.
LOG = logging.getLogger()
def get_encoding(page):
    """Guess the character encoding of an HTML page.

    Markup tags are stripped first so that only the remaining text takes
    part in the detection.  If that text is (almost) entirely valid UTF-8,
    ``'utf-8'`` is returned without consulting chardet; otherwise the guess
    is delegated to ``chardet.detect``.

    Args:
        page: Page content as a byte string.  Already-decoded text is
            accepted as well and is inspected through its UTF-8 encoding.

    Returns:
        The name of the guessed encoding.  ``'utf-8'`` is returned when
        there is too little text to guess from, or when chardet cannot
        decide on an encoding.
    """
    enc = 'utf-8'
    if not isinstance(page, bytes):
        # Detection works on bytes; text that is already decoded is
        # examined through its UTF-8 representation.
        page = page.encode(enc)

    # Replace every tag (and the whitespace following it) with one space.
    text = re.sub(br'</?[^>]*>\s*', b' ', page)
    if not text.strip() or len(text) < 10:
        return enc  # can't guess

    # Decoding with 'ignore' drops every byte that is not valid UTF-8, so
    # the size difference after re-encoding counts the invalid bytes.
    diff = text.decode(enc, 'ignore').encode(enc)
    sizes = len(diff), len(text)
    # Accept UTF-8 when less than 1% of the bytes had to be dropped.
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
        return enc

    res = chardet.detect(text)
    # chardet reports None when it cannot decide; never hand that to callers.
    enc = res['encoding'] or 'utf-8'
    if enc == 'MacCyrillic':
        # Report cp1251 whenever chardet answers MacCyrillic.
        enc = 'cp1251'
    return enc