To handle this new line, I used the pyparsing Optional class to mark the appname part as optional, and split out the trailing ':'. In the code below, I also made a few tweaks: parse actions for parse-time data conversion, and results names to simplify creating the resulting dict in the parse() method.
from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex, Optional
from datetime import datetime
class Parser(object):
    """Parse syslog-style lines into a dict using a pyparsing grammar.

    Expected layout of a line::

        <month> <day> <HH:MM:SS> <hostname> [<appname>[[<pid>]]]: <message>

    The appname part, and the bracketed pid after it, are both optional.
    """

    # log lines don't include the year, but if we don't provide one,
    # datetime.strptime will assume 1900
    ASSUMED_YEAR = '2016'

    def __init__(self):
        """Build the grammar once and keep it in ``self._pattern``."""
        ints = Word(nums)

        # priority (not part of the pattern; kept for reference)
        # priority = Suppress("<") + ints + Suppress(">")

        # timestamp: capitalized three-letter month, day number, HH:MM:SS
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day = ints
        time_of_day = Combine(ints + ":" + ints + ":" + ints)
        timestamp = month + day + time_of_day
        # a parse action converts the three timestamp tokens to one datetime
        timestamp.setParseAction(
            lambda t: datetime.strptime(
                Parser.ASSUMED_YEAR + ' ' + ' '.join(t),
                '%Y %b %d %H:%M:%S',
            )
        )

        # hostname
        hostname = Word(alphas + nums + "_-.")

        # appname, optionally followed by "[pid]".
        # Using a single appname word with an optional pid suffix (instead of
        # two alternatives with different character sets) lets names that
        # contain parentheses parse whether or not a pid follows them.
        pid = Suppress("[") + ints("pid") + Suppress("]")
        appname = Word(alphas + "/-_.()")("appname") + Optional(pid)
        appname.setName("appname")

        # message: everything remaining on the line
        message = Regex(".*")

        # pattern build
        # (results names make it easier to access parsed fields)
        self._pattern = (
            timestamp("timestamp")
            + hostname("hostname")
            + Optional(appname)
            + Suppress(':')
            + message("message")
        )

    def parse(self, line):
        """Parse one log line and return its named fields as a dict.

        The returned dict has the keys ``timestamp`` (a datetime),
        ``hostname``, ``appname``, ``pid`` and ``message``. ``appname`` and
        ``pid`` are set to ``''`` when the line does not contain them.

        pyparsing raises ``ParseException`` if the line does not match.
        """
        fields = self._pattern.parseString(line).asDict()
        # fill in keys that might not have been found in the input string
        for key in ('appname', 'pid'):
            fields.setdefault(key, '')
        return fields
Use runTests() to test your parser against specific test inputs:
# Take the pyparsing expression built by Parser so runTests() can be called on it directly.
pattern = Parser()._pattern
# Sample log lines, one test case per line. They cover lines with a pid,
# lines with an appname but no pid, and (last line) a line with no appname.
tests = """\
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
Mar 7 21:23:22 avas dccifd[6191]: missing message body
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
Mar 8 16:05:26 avas arpwatch: listening on eth0
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
Mar 8 15:18:40 avas: last message repeated 11 times"""
# Run every sample line through the pattern; each input is printed followed by
# its parsed tokens and named results.
pattern.runTests(tests)
Gives:
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
[datetime.datetime(2016, 3, 7, 4, 2, 16), 'avas', 'clamd', '11165', '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND'
- pid: '11165'
- timestamp: datetime.datetime(2016, 3, 7, 4, 2, 16)
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
[datetime.datetime(2016, 3, 7, 4, 5, 55), 'avas', 'clamd', '11240', '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND'
- pid: '11240'
- timestamp: datetime.datetime(2016, 3, 7, 4, 5, 55)
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
[datetime.datetime(2016, 3, 7, 9, 0, 51), 'avas', 'clamd', '27173', 'SelfCheck: Database status OK.']
- appname: 'clamd'
- hostname: 'avas'
- message: 'SelfCheck: Database status OK.'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 9, 0, 51)
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
[datetime.datetime(2016, 3, 7, 5, 59, 2), 'avas', 'clamd', '27173', 'Database correctly reloaded (20400 viruses)']
- appname: 'clamd'
- hostname: 'avas'
- message: 'Database correctly reloaded (20400 viruses)'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 5, 59, 2)
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
[datetime.datetime(2016, 3, 7, 11, 14, 35), 'avas', 'dccd', '13284', '21 requests/sec are too many from anonymous 205.201.1.56,2246']
- appname: 'dccd'
- hostname: 'avas'
- message: '21 requests/sec are too many from anonymous 205.201.1.56,2246'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 7, 11, 14, 35)
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
[datetime.datetime(2016, 3, 8, 0, 22, 57), 'avas', 'dccifd', '9933', 'write(MTA socket,4): Broken pipe']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'write(MTA socket,4): Broken pipe'
- pid: '9933'
- timestamp: datetime.datetime(2016, 3, 8, 0, 22, 57)
Mar 7 21:23:22 avas dccifd[6191]: missing message body
[datetime.datetime(2016, 3, 7, 21, 23, 22), 'avas', 'dccifd', '6191', 'missing message body']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'missing message body'
- pid: '6191'
- timestamp: datetime.datetime(2016, 3, 7, 21, 23, 22)
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
[datetime.datetime(2016, 3, 9, 16, 5, 17), 'avas', 'named', '12045', 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53'
- pid: '12045'
- timestamp: datetime.datetime(2016, 3, 9, 16, 5, 17)
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
[datetime.datetime(2016, 3, 10, 0, 38, 16), 'avas', 'dccifd', '23069', 'continue not asking DCC 17 seconds after failure']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'continue not asking DCC 17 seconds after failure'
- pid: '23069'
- timestamp: datetime.datetime(2016, 3, 10, 0, 38, 16)
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
[datetime.datetime(2016, 3, 10, 9, 42, 11), 'avas', 'named', 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT'
- timestamp: datetime.datetime(2016, 3, 10, 9, 42, 11)
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
[datetime.datetime(2016, 3, 9, 3, 48, 7), 'avas', 'dccd', '145', 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`']
- appname: 'dccd'
- hostname: 'avas'
- message: 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`'
- pid: '145'
- timestamp: datetime.datetime(2016, 3, 9, 3, 48, 7)
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
[datetime.datetime(2016, 3, 9, 11, 58, 18), 'avas', 'kernel', 'i810_audio: Connection 0 with codec id 2']
- appname: 'kernel'
- hostname: 'avas'
- message: 'i810_audio: Connection 0 with codec id 2'
- timestamp: datetime.datetime(2016, 3, 9, 11, 58, 18)
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
[datetime.datetime(2016, 3, 9, 19, 41, 13), 'avas', 'dccd', '3004', '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577']
- appname: 'dccd'
- hostname: 'avas'
- message: '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577'
- pid: '3004'
- timestamp: datetime.datetime(2016, 3, 9, 19, 41, 13)
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
[datetime.datetime(2016, 3, 8, 9, 1, 7), 'avas', 'sshd(pam_unix)', '21839', 'session opened for user tom by (uid=35567)']
- appname: 'sshd(pam_unix)'
- hostname: 'avas'
- message: 'session opened for user tom by (uid=35567)'
- pid: '21839'
- timestamp: datetime.datetime(2016, 3, 8, 9, 1, 7)
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
[datetime.datetime(2016, 3, 8, 3, 52, 4), 'avas', 'dccd', '13284', '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window']
- appname: 'dccd'
- hostname: 'avas'
- message: '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 8, 3, 52, 4)
Mar 8 16:05:26 avas arpwatch: listening on eth0
[datetime.datetime(2016, 3, 8, 16, 5, 26), 'avas', 'arpwatch', 'listening on eth0']
- appname: 'arpwatch'
- hostname: 'avas'
- message: 'listening on eth0'
- timestamp: datetime.datetime(2016, 3, 8, 16, 5, 26)
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
[datetime.datetime(2016, 3, 10, 10, 0, 6), 'avas', 'named', '6986', 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 6)
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
[datetime.datetime(2016, 3, 10, 10, 0, 10), 'avas', 'named', '6986', 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 10)
Mar 8 15:18:40 avas: last message repeated 11 times
[datetime.datetime(2016, 3, 8, 15, 18, 40), 'avas', 'last message repeated 11 times']
- hostname: 'avas'
- message: 'last message repeated 11 times'
- timestamp: datetime.datetime(2016, 3, 8, 15, 18, 40)
Or using the parse() method of the Parser class:
from pprint import pprint
# Build the grammar once and reuse it: constructing a Parser for every line
# repeats the same setup work without changing the result.
parser = Parser()
for t in tests.splitlines():
    pprint(parser.parse(t))
    print()
gives:
{'appname': 'clamd',
'hostname': 'avas',
'message': '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: '
'Worm.Mydoom.F FOUND ',
'pid': '11165',
'timestamp': datetime.datetime(2016, 3, 7, 4, 2, 16)}
{'appname': 'clamd',
'hostname': 'avas',
'message': '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: '
'Worm.SomeFool.Gen-1 FOUND ',
'pid': '11240',
'timestamp': datetime.datetime(2016, 3, 7, 4, 5, 55)}
{'appname': 'clamd',
'hostname': 'avas',
'message': 'SelfCheck: Database status OK.',
'pid': '27173',
'timestamp': datetime.datetime(2016, 3, 7, 9, 0, 51)}
{'appname': 'clamd',
'hostname': 'avas',
'message': 'Database correctly reloaded (20400 viruses) ',
'pid': '27173',
'timestamp': datetime.datetime(2016, 3, 7, 5, 59, 2)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '21 requests/sec are too many from anonymous 205.201.1.56,2246',
'pid': '13284',
'timestamp': datetime.datetime(2016, 3, 7, 11, 14, 35)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'write(MTA socket,4): Broken pipe',
'pid': '9933',
'timestamp': datetime.datetime(2016, 3, 8, 0, 22, 57)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'missing message body',
'pid': '6191',
'timestamp': datetime.datetime(2016, 3, 7, 21, 23, 22)}
{'appname': 'named',
'hostname': 'avas',
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
'10.0.0.253#53',
'pid': '12045',
'timestamp': datetime.datetime(2016, 3, 9, 16, 5, 17)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'continue not asking DCC 17 seconds after failure',
'pid': '23069',
'timestamp': datetime.datetime(2016, 3, 10, 0, 38, 16)}
{'appname': 'named',
'hostname': 'avas',
'message': 'client 127.0.0.1#55524: query: '
'23.68.27.142.sa-trusted.bondedsender.org IN TXT',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 10, 9, 42, 11)}
{'appname': 'dccd',
'hostname': 'avas',
'message': 'automatic dbclean; starting `dbclean -DPq -i 1189 -L '
'info,local5.notice -L error,local5.err`',
'pid': '145',
'timestamp': datetime.datetime(2016, 3, 9, 3, 48, 7)}
{'appname': 'kernel',
'hostname': 'avas',
'message': 'i810_audio: Connection 0 with codec id 2',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 9, 11, 58, 18)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '"packet length 44 too small for REPORT" sent to client 1 at '
'194.63.250.215,47577',
'pid': '3004',
'timestamp': datetime.datetime(2016, 3, 9, 19, 41, 13)}
{'appname': 'sshd(pam_unix)',
'hostname': 'avas',
'message': 'session opened for user tom by (uid=35567)',
'pid': '21839',
'timestamp': datetime.datetime(2016, 3, 8, 9, 1, 7)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window',
'pid': '13284',
'timestamp': datetime.datetime(2016, 3, 8, 3, 52, 4)}
{'appname': 'arpwatch',
'hostname': 'avas',
'message': 'listening on eth0',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 8, 16, 5, 26)}
{'appname': 'named',
'hostname': 'avas',
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
'192.75.26.21#53',
'pid': '6986',
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 6)}
{'appname': 'named',
'hostname': 'avas',
'message': 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX',
'pid': '6986',
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 10)}
{'appname': '',
'hostname': 'avas',
'message': 'last message repeated 11 times',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 8, 15, 18, 40)}