Skip to content

Commit 674e2d0

Browse files
Issue python#15068: Got rid of excessive buffering in fileinput.
The bufsize parameter is now deprecated and ignored.
2 parents 238fecd + cc2dbc5 commit 674e2d0

File tree

4 files changed

+160
-95
lines changed

4 files changed

+160
-95
lines changed

Doc/library/fileinput.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ The following function is the primary interface of this module:
7171
.. versionchanged:: 3.2
7272
Can be used as a context manager.
7373

74+
.. deprecated-removed:: 3.6 3.8
75+
The *bufsize* parameter.
7476

7577
The following functions use the global state created by :func:`fileinput.input`;
7678
if there is no active state, :exc:`RuntimeError` is raised.
@@ -161,7 +163,10 @@ available for subclassing as well:
161163
Can be used as a context manager.
162164

163165
.. deprecated:: 3.4
164-
The ``'rU'`` and ``'U'`` modes.
166+
The ``'rU'`` and ``'U'`` modes.
167+
168+
.. deprecated-removed:: 3.6 3.8
169+
The *bufsize* parameter.
165170

166171

167172
**Optional in-place filtering:** if the keyword argument ``inplace=True`` is

Lib/fileinput.py

Lines changed: 76 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,6 @@
6464
disabled when standard input is read. XXX The current implementation
6565
does not work for MS-DOS 8+3 filesystems.
6666
67-
Performance: this module is unfortunately one of the slower ways of
68-
processing large numbers of input lines. Nevertheless, a significant
69-
speed-up has been obtained by using readlines(bufsize) instead of
70-
readline(). A new keyword argument, bufsize=N, is present on the
71-
input() function and the FileInput() class to override the default
72-
buffer size.
73-
7467
XXX Possible additions:
7568
7669
- optional getopt argument processing
@@ -87,8 +80,6 @@
8780

8881
_state = None
8982

90-
DEFAULT_BUFSIZE = 8*1024
91-
9283
def input(files=None, inplace=False, backup="", bufsize=0,
9384
mode="r", openhook=None):
9485
"""Return an instance of the FileInput class, which can be iterated.
@@ -208,17 +199,19 @@ def __init__(self, files=None, inplace=False, backup="", bufsize=0,
208199
self._files = files
209200
self._inplace = inplace
210201
self._backup = backup
211-
self._bufsize = bufsize or DEFAULT_BUFSIZE
202+
if bufsize:
203+
import warnings
204+
warnings.warn('bufsize is deprecated and ignored',
205+
DeprecationWarning, stacklevel=2)
212206
self._savestdout = None
213207
self._output = None
214208
self._filename = None
215-
self._lineno = 0
209+
self._startlineno = 0
216210
self._filelineno = 0
217211
self._file = None
212+
self._readline = self._start_readline
218213
self._isstdin = False
219214
self._backupfilename = None
220-
self._buffer = []
221-
self._bufindex = 0
222215
# restrict mode argument to reading modes
223216
if mode not in ('r', 'rU', 'U', 'rb'):
224217
raise ValueError("FileInput opening mode must be one of "
@@ -254,22 +247,18 @@ def __iter__(self):
254247
return self
255248

256249
def __next__(self):
257-
try:
258-
line = self._buffer[self._bufindex]
259-
except IndexError:
260-
pass
261-
else:
262-
self._bufindex += 1
263-
self._lineno += 1
250+
line = self._readline()
251+
if line:
264252
self._filelineno += 1
265253
return line
266-
line = self.readline()
267-
if not line:
254+
if not self._file:
268255
raise StopIteration
269-
return line
256+
self.nextfile()
257+
# Recursive call
258+
return self.__next__()
270259

271260
def __getitem__(self, i):
272-
if i != self._lineno:
261+
if i != self.lineno():
273262
raise RuntimeError("accessing lines out of order")
274263
try:
275264
return self.__next__()
@@ -290,6 +279,7 @@ def nextfile(self):
290279
finally:
291280
file = self._file
292281
self._file = None
282+
self._readline = self._start_readline
293283
try:
294284
if file and not self._isstdin:
295285
file.close()
@@ -301,85 +291,81 @@ def nextfile(self):
301291
except OSError: pass
302292

303293
self._isstdin = False
304-
self._buffer = []
305-
self._bufindex = 0
306294

307295
def readline(self):
308-
try:
309-
line = self._buffer[self._bufindex]
310-
except IndexError:
311-
pass
296+
while True:
297+
line = self._readline()
298+
if line:
299+
self._filelineno += 1
300+
return line
301+
if not self._file:
302+
return line
303+
self.nextfile()
304+
# repeat with next file
305+
306+
def _start_readline(self):
307+
if not self._files:
308+
if 'b' in self._mode:
309+
return b''
310+
else:
311+
return ''
312+
self._filename = self._files[0]
313+
self._files = self._files[1:]
314+
self._startlineno = self.lineno()
315+
self._filelineno = 0
316+
self._file = None
317+
self._isstdin = False
318+
self._backupfilename = 0
319+
if self._filename == '-':
320+
self._filename = '<stdin>'
321+
if 'b' in self._mode:
322+
self._file = getattr(sys.stdin, 'buffer', sys.stdin)
323+
else:
324+
self._file = sys.stdin
325+
self._isstdin = True
312326
else:
313-
self._bufindex += 1
314-
self._lineno += 1
315-
self._filelineno += 1
316-
return line
317-
if not self._file:
318-
if not self._files:
319-
if 'b' in self._mode:
320-
return b''
327+
if self._inplace:
328+
self._backupfilename = (
329+
self._filename + (self._backup or ".bak"))
330+
try:
331+
os.unlink(self._backupfilename)
332+
except OSError:
333+
pass
334+
# The next few lines may raise OSError
335+
os.rename(self._filename, self._backupfilename)
336+
self._file = open(self._backupfilename, self._mode)
337+
try:
338+
perm = os.fstat(self._file.fileno()).st_mode
339+
except OSError:
340+
self._output = open(self._filename, "w")
321341
else:
322-
return ''
323-
self._filename = self._files[0]
324-
self._files = self._files[1:]
325-
self._filelineno = 0
326-
self._file = None
327-
self._isstdin = False
328-
self._backupfilename = 0
329-
if self._filename == '-':
330-
self._filename = '<stdin>'
331-
if 'b' in self._mode:
332-
self._file = getattr(sys.stdin, 'buffer', sys.stdin)
333-
else:
334-
self._file = sys.stdin
335-
self._isstdin = True
336-
else:
337-
if self._inplace:
338-
self._backupfilename = (
339-
self._filename + (self._backup or ".bak"))
342+
mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC
343+
if hasattr(os, 'O_BINARY'):
344+
mode |= os.O_BINARY
345+
346+
fd = os.open(self._filename, mode, perm)
347+
self._output = os.fdopen(fd, "w")
340348
try:
341-
os.unlink(self._backupfilename)
349+
if hasattr(os, 'chmod'):
350+
os.chmod(self._filename, perm)
342351
except OSError:
343352
pass
344-
# The next few lines may raise OSError
345-
os.rename(self._filename, self._backupfilename)
346-
self._file = open(self._backupfilename, self._mode)
347-
try:
348-
perm = os.fstat(self._file.fileno()).st_mode
349-
except OSError:
350-
self._output = open(self._filename, "w")
351-
else:
352-
mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC
353-
if hasattr(os, 'O_BINARY'):
354-
mode |= os.O_BINARY
355-
356-
fd = os.open(self._filename, mode, perm)
357-
self._output = os.fdopen(fd, "w")
358-
try:
359-
if hasattr(os, 'chmod'):
360-
os.chmod(self._filename, perm)
361-
except OSError:
362-
pass
363-
self._savestdout = sys.stdout
364-
sys.stdout = self._output
353+
self._savestdout = sys.stdout
354+
sys.stdout = self._output
355+
else:
356+
# This may raise OSError
357+
if self._openhook:
358+
self._file = self._openhook(self._filename, self._mode)
365359
else:
366-
# This may raise OSError
367-
if self._openhook:
368-
self._file = self._openhook(self._filename, self._mode)
369-
else:
370-
self._file = open(self._filename, self._mode)
371-
self._buffer = self._file.readlines(self._bufsize)
372-
self._bufindex = 0
373-
if not self._buffer:
374-
self.nextfile()
375-
# Recursive call
376-
return self.readline()
360+
self._file = open(self._filename, self._mode)
361+
self._readline = self._file.readline
362+
return self._readline()
377363

378364
def filename(self):
379365
return self._filename
380366

381367
def lineno(self):
382-
return self._lineno
368+
return self._startlineno + self._filelineno
383369

384370
def filelineno(self):
385371
return self._filelineno

Lib/test/test_fileinput.py

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,42 @@ def remove_tempfiles(*names):
4747
if name:
4848
safe_unlink(name)
4949

50+
class LineReader:
    """Minimal file-like test double that logs every line it hands out.

    The object doubles as its own ``openhook``: the "filename" passed to
    :meth:`openhook` is not opened, it is used as the text content and
    split into lines.  Every value returned by :meth:`readline`,
    including the empty string returned at end of input, is recorded so
    a test can check exactly how much the consumer read ahead.
    """

    def __init__(self):
        # Lines handed out since ``linesread`` was last inspected.
        self._linesread = []

    @property
    def linesread(self):
        """Return the lines read since the previous access, then reset.

        The returned list is no longer referenced by the reader, so the
        caller may keep or mutate it freely.
        """
        lines, self._linesread = self._linesread, []
        return lines

    def openhook(self, filename, mode):
        """Start serving the lines of *filename* and return this reader.

        *filename* is treated as the content itself (split with line
        endings kept); *mode* is accepted for signature compatibility
        and ignored.
        """
        self.it = iter(filename.splitlines(True))
        return self

    def readline(self, size=None):
        """Return the next line, or ``''`` when the content is exhausted.

        *size* is accepted for signature compatibility and ignored.
        The returned value is always appended to the read log.
        """
        line = next(self.it, '')
        self._linesread.append(line)
        return line

    def readlines(self, hint=-1):
        """Return a list of the remaining lines, reading via readline().

        With a positive *hint*, stop once the total length of the lines
        collected so far reaches *hint*.  A *hint* of ``None``, zero or
        a negative number means "no limit", matching the usual
        file-object protocol; the default therefore reads to the end.
        """
        unlimited = hint is None or hint <= 0
        lines = []
        size = 0
        while True:
            line = self.readline()
            if not line:
                return lines
            lines.append(line)
            size += len(line)
            if not unlimited and size >= hint:
                return lines

    def close(self):
        """Do nothing; there is no underlying resource to release."""
        pass
85+
5086
class BufferSizesTests(unittest.TestCase):
5187
def test_buffer_sizes(self):
5288
# First, run the tests with default and teeny buffer size.
@@ -57,7 +93,11 @@ def test_buffer_sizes(self):
5793
t2 = writeTmp(2, ["Line %s of file 2\n" % (i+1) for i in range(10)])
5894
t3 = writeTmp(3, ["Line %s of file 3\n" % (i+1) for i in range(5)])
5995
t4 = writeTmp(4, ["Line %s of file 4\n" % (i+1) for i in range(1)])
60-
self.buffer_size_test(t1, t2, t3, t4, bs, round)
96+
if bs:
97+
with self.assertWarns(DeprecationWarning):
98+
self.buffer_size_test(t1, t2, t3, t4, bs, round)
99+
else:
100+
self.buffer_size_test(t1, t2, t3, t4, bs, round)
61101
finally:
62102
remove_tempfiles(t1, t2, t3, t4)
63103

@@ -290,7 +330,7 @@ def test_readline(self):
290330
self.addCleanup(safe_unlink, TESTFN)
291331

292332
with FileInput(files=TESTFN,
293-
openhook=hook_encoded('ascii'), bufsize=8) as fi:
333+
openhook=hook_encoded('ascii')) as fi:
294334
try:
295335
self.assertEqual(fi.readline(), 'A\n')
296336
self.assertEqual(fi.readline(), 'B\n')
@@ -458,6 +498,38 @@ def fileno(self):
458498

459499
self.assertEqual(result, -1, "fileno() should return -1")
460500

501+
def test_readline_buffering(self):
    """readline() must pull exactly one line at a time from the source."""
    reader = LineReader()
    # Each pair is (value returned by fi.readline(), what the source was
    # asked to read in order to produce it).
    steps = (
        ('line1\n', ['line1\n']),
        ('line2', ['line2']),
        # The '' is the end-of-file read on the first source that
        # triggers the switch to the second one.
        ('line3\n', ['', 'line3\n']),
        # First exhaustion: one end-of-file read on the last source.
        ('', ['']),
        # Already exhausted: the source must not be touched again.
        ('', []),
    )
    with FileInput(files=['line1\nline2', 'line3\n'],
                   openhook=reader.openhook) as fi:
        # Nothing may be read before the first line is requested.
        self.assertEqual(reader.linesread, [])
        for returned, pulled in steps:
            self.assertEqual(fi.readline(), returned)
            self.assertEqual(reader.linesread, pulled)
516+
517+
def test_iteration_buffering(self):
518+
src = LineReader()
519+
with FileInput(files=['line1\nline2', 'line3\n'],
520+
openhook=src.openhook) as fi:
521+
self.assertEqual(src.linesread, [])
522+
self.assertEqual(next(fi), 'line1\n')
523+
self.assertEqual(src.linesread, ['line1\n'])
524+
self.assertEqual(next(fi), 'line2')
525+
self.assertEqual(src.linesread, ['line2'])
526+
self.assertEqual(next(fi), 'line3\n')
527+
self.assertEqual(src.linesread, ['', 'line3\n'])
528+
self.assertRaises(StopIteration, next, fi)
529+
self.assertEqual(src.linesread, [''])
530+
self.assertRaises(StopIteration, next, fi)
531+
self.assertEqual(src.linesread, [])
532+
461533
class MockFileInput:
462534
"""A class that mocks out fileinput.FileInput for use during unit tests"""
463535

@@ -917,8 +989,7 @@ def check(mode, expected_lines):
917989
class MiscTest(unittest.TestCase):
918990

919991
def test_all(self):
920-
blacklist = {'DEFAULT_BUFSIZE'}
921-
support.check__all__(self, fileinput, blacklist=blacklist)
992+
support.check__all__(self, fileinput)
922993

923994

924995
if __name__ == "__main__":

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,9 @@ Core and Builtins
201201
Library
202202
-------
203203

204+
- Issue #15068: Got rid of excessive buffering in fileinput.
205+
The bufsize parameter is now deprecated and ignored.
206+
204207
- Issue #19475: Added an optional argument timespec to the datetime
205208
isoformat() method to choose the precision of the time component.
206209

0 commit comments

Comments
 (0)