Skip to content

Commit 439d003

Browse files
committed
Adding new version of chardet
1 parent d424d4c commit 439d003

39 files changed

+1510
-1159
lines changed

thirdparty/chardet/__init__.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,28 @@
33
# modify it under the terms of the GNU Lesser General Public
44
# License as published by the Free Software Foundation; either
55
# version 2.1 of the License, or (at your option) any later version.
6-
#
6+
#
77
# This library is distributed in the hope that it will be useful,
88
# but WITHOUT ANY WARRANTY; without even the implied warranty of
99
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1010
# Lesser General Public License for more details.
11-
#
11+
#
1212
# You should have received a copy of the GNU Lesser General Public
1313
# License along with this library; if not, write to the Free Software
1414
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
1515
# 02110-1301 USA
1616
######################### END LICENSE BLOCK #########################
1717

18-
__version__ = "2.0.1"
18+
__version__ = "2.3.0"
19+
from sys import version_info
20+
1921

2022
def detect(aBuf):
21-
import universaldetector
23+
if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
24+
(version_info >= (3, 0) and not isinstance(aBuf, bytes))):
25+
raise ValueError('Expected a bytes object, not a unicode object')
26+
27+
from . import universaldetector
2228
u = universaldetector.UniversalDetector()
2329
u.reset()
2430
u.feed(aBuf)

thirdparty/chardet/big5freq.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,51 @@
11
######################## BEGIN LICENSE BLOCK ########################
22
# The Original Code is Mozilla Communicator client code.
3-
#
3+
#
44
# The Initial Developer of the Original Code is
55
# Netscape Communications Corporation.
66
# Portions created by the Initial Developer are Copyright (C) 1998
77
# the Initial Developer. All Rights Reserved.
8-
#
8+
#
99
# Contributor(s):
1010
# Mark Pilgrim - port to Python
1111
#
1212
# This library is free software; you can redistribute it and/or
1313
# modify it under the terms of the GNU Lesser General Public
1414
# License as published by the Free Software Foundation; either
1515
# version 2.1 of the License, or (at your option) any later version.
16-
#
16+
#
1717
# This library is distributed in the hope that it will be useful,
1818
# but WITHOUT ANY WARRANTY; without even the implied warranty of
1919
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2020
# Lesser General Public License for more details.
21-
#
21+
#
2222
# You should have received a copy of the GNU Lesser General Public
2323
# License along with this library; if not, write to the Free Software
2424
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
2525
# 02110-1301 USA
2626
######################### END LICENSE BLOCK #########################
2727

2828
# Big5 frequency table
29-
# by Taiwan's Mandarin Promotion Council
29+
# by Taiwan's Mandarin Promotion Council
3030
# <http://www.edu.tw:81/mandr/>
31-
#
31+
#
3232
# 128 --> 0.42261
3333
# 256 --> 0.57851
3434
# 512 --> 0.74851
3535
# 1024 --> 0.89384
3636
# 2048 --> 0.97583
37-
#
37+
#
3838
# Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
3939
# Random Distribution Ratio = 512/(5401-512)=0.105
40-
#
40+
#
4141
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
4242

4343
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
4444

4545
#Char to FreqOrder table
4646
BIG5_TABLE_SIZE = 5376
4747

48-
Big5CharToFreqOrder = ( \
48+
Big5CharToFreqOrder = (
4949
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
5050
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
5151
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
@@ -921,3 +921,5 @@
921921
13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
922922
13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
923923
13968,13969,13970,13971,13972) #13973
924+
925+
# flake8: noqa

thirdparty/chardet/big5prober.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,35 @@
11
######################## BEGIN LICENSE BLOCK ########################
22
# The Original Code is Mozilla Communicator client code.
3-
#
3+
#
44
# The Initial Developer of the Original Code is
55
# Netscape Communications Corporation.
66
# Portions created by the Initial Developer are Copyright (C) 1998
77
# the Initial Developer. All Rights Reserved.
8-
#
8+
#
99
# Contributor(s):
1010
# Mark Pilgrim - port to Python
1111
#
1212
# This library is free software; you can redistribute it and/or
1313
# modify it under the terms of the GNU Lesser General Public
1414
# License as published by the Free Software Foundation; either
1515
# version 2.1 of the License, or (at your option) any later version.
16-
#
16+
#
1717
# This library is distributed in the hope that it will be useful,
1818
# but WITHOUT ANY WARRANTY; without even the implied warranty of
1919
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2020
# Lesser General Public License for more details.
21-
#
21+
#
2222
# You should have received a copy of the GNU Lesser General Public
2323
# License along with this library; if not, write to the Free Software
2424
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
2525
# 02110-1301 USA
2626
######################### END LICENSE BLOCK #########################
2727

28-
from mbcharsetprober import MultiByteCharSetProber
29-
from codingstatemachine import CodingStateMachine
30-
from chardistribution import Big5DistributionAnalysis
31-
from mbcssm import Big5SMModel
28+
from .mbcharsetprober import MultiByteCharSetProber
29+
from .codingstatemachine import CodingStateMachine
30+
from .chardistribution import Big5DistributionAnalysis
31+
from .mbcssm import Big5SMModel
32+
3233

3334
class Big5Prober(MultiByteCharSetProber):
3435
def __init__(self):

thirdparty/chardet/chardetect.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/usr/bin/env python
2+
"""
3+
Script which takes one or more file paths and reports on their detected
4+
encodings
5+
6+
Example::
7+
8+
% chardetect somefile someotherfile
9+
somefile: windows-1252 with confidence 0.5
10+
someotherfile: ascii with confidence 1.0
11+
12+
If no paths are provided, it takes its input from stdin.
13+
14+
"""
15+
16+
from __future__ import absolute_import, print_function, unicode_literals
17+
18+
import argparse
19+
import sys
20+
from io import open
21+
22+
from chardet import __version__
23+
from chardet.universaldetector import UniversalDetector
24+
25+
26+
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :returns: ``'<name>: <encoding> with confidence <confidence>'`` when an
              encoding was detected, otherwise ``'<name>: no result'``.
    :rtype: str
    """
    u = UniversalDetector()
    for line in lines:
        u.feed(line)
        # Once the detector has reached a verdict (e.g. after seeing a BOM)
        # further input is not needed, so stop reading early instead of
        # consuming the rest of the stream.
        if u.done:
            break
    # close() finalizes the detection and populates u.result.
    u.close()
    result = u.result
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
                                                     result['confidence'])
    else:
        return '{0}: no result'.format(name)
46+
47+
48+
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    Each input file is opened in binary mode, run through
    ``description_of`` and the resulting description is printed to stdout.
    If no paths are given, input is read from stdin.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # On Python 3 ``sys.stdin`` is a text stream that yields ``str``; the
    # detector needs raw bytes, so use the underlying binary buffer when it
    # exists. On Python 2 there is no ``buffer`` attribute and ``sys.stdin``
    # already yields byte strings.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes one or more file paths and reports their detected "
                    "encodings",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('input',
                        help='File whose encoding we would like to determine.',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[stdin])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
        try:
            if f.isatty():
                print("You are running chardetect interactively. Press " +
                      "CTRL-D twice at the start of a blank line to signal the " +
                      "end of your input. If you want help, run chardetect " +
                      "--help\n", file=sys.stderr)
            print(description_of(f, f.name))
        finally:
            # Release handles opened by argparse.FileType, but leave the
            # process-wide stdin stream open for the caller.
            if f is not stdin and f is not sys.stdin:
                f.close()
77+
78+
79+
if __name__ == '__main__':
80+
main()

0 commit comments

Comments
 (0)