2

How can I parse a log file that contains lines in multiple formats using the pyparsing module? The following is the code I am working with.

# -*- coding: utf-8 -*-
"""

"""

import pandas as pd

from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex

from time import strftime

class Parser(object):
    """Parse syslog-style lines into a dict of fields.

    Accepted line shapes (whitespace between fields is skipped by pyparsing):

        Mon DD HH:MM:SS host app[pid]: message
        Mon DD HH:MM:SS host app: message
        Mon DD HH:MM:SS host: message
    """

    def __init__(self):
        ints = Word(nums)

        # NOTE: a leading "<priority>" field is not part of this grammar.

        # timestamp: three-letter capitalised month, day, HH:MM:SS
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day = ints
        hour = Combine(ints + ":" + ints + ":" + ints)
        timestamp = month + day + hour

        # hostname
        hostname = Word(alphas + nums + "_-.")

        # appname, optionally followed by "[pid]". The alternative with a pid
        # is tried first; its character set also allows "(" and ")" so names
        # such as "sshd(pam_unix)" are accepted.
        pid = Suppress("[") + ints("pid") + Suppress("]")
        appname = (
            Word(alphas + "/-_.()")("appname") + pid
            | Word(alphas + "/-_.")("appname")
        )

        # The ":" separator is always present, the appname is not
        # (e.g. "avas: last message repeated 11 times"), so fall back to a
        # bare ":" when no appname matches.
        separator = Suppress(":")
        app_part = appname + separator | separator

        # message: the rest of the line
        message = Regex(".*")

        # pattern build (results names let parse() look fields up by name
        # instead of by a position that shifts when optional parts are absent)
        self.__pattern = (
            timestamp + hostname("hostname") + app_part + message("message")
        )

    def parse(self, line):
        """Parse one log line and return its fields as a dict.

        The returned dict has the keys ``timestamp``, ``hostname``,
        ``appname``, ``pid`` and ``message``. ``appname`` and ``pid`` are
        empty strings when the line does not contain them. ``timestamp`` is
        the current local time formatted as ``%Y-%m-%d %H:%M:%S`` (the time
        of parsing), not the time written in the log line.

        Raises pyparsing's ParseException if the line does not match the
        grammar.
        """
        parsed = self.__pattern.parseString(line)

        return {
            "timestamp": strftime("%Y-%m-%d %H:%M:%S"),
            "hostname": parsed["hostname"],
            "appname": parsed.get("appname", ""),
            "pid": parsed.get("pid", ""),
            "message": parsed["message"],
        }


def main():
    """Parse every line of ``./messages.log`` and return the parsed records.

    Returns a list holding one ``Parser.parse`` result per line, in file order.
    """
    log_parser = Parser()

    with open('./messages.log') as log_file:
        return [log_parser.parse(entry) for entry in log_file]


if __name__ == "__main__":
    main()

The following is a sample of the different log lines that need to be parsed:

Mar  7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
Mar  7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
Mar  7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
Mar  7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
Mar  7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
Mar  8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
Mar  7 21:23:22 avas dccifd[6191]: missing message body
Mar  9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
Mar  9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
Mar  9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
Mar  9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
Mar  8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
Mar  8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
Mar  8 16:05:26 avas arpwatch: listening on eth0
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
Mar  8 15:18:40 avas: last message repeated 11 times

Please suggest what I should do.

3
  • You mean something like this? gist.github.com/leandrosilva/3651640 Commented Dec 14, 2016 at 11:14
  • Yup, like that, but the log format in my file is not always the same. I get an error ("List Out of Index") while parsing the following line: "Mar 8 15:18:40 avas: last message repeated 11 times" Commented Dec 14, 2016 at 12:12
  • You could always try except on an IndexError Commented Dec 14, 2016 at 12:25

1 Answer 1

2

To handle this new line, I used the pyparsing Optional class to mark the appname part as optional, and split out the trailing ':'. In the code below, I also made a few tweaks: I added some parse actions for parse-time data conversion, and some results names to simplify creating the resulting dict in the parse() method.

from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex, Optional

from datetime import datetime

class Parser(object):
    """Parse syslog-style lines into a dict of named fields.

    Accepted line shapes:

        Mon DD HH:MM:SS host app[pid]: message
        Mon DD HH:MM:SS host app: message
        Mon DD HH:MM:SS host: message
    """

    # log lines don't include the year, but if we don't provide one, datetime.strptime will assume 1900
    ASSUMED_YEAR = '2016'

    def __init__(self, assumed_year=None):
        """Build the pyparsing grammar.

        assumed_year: year (str or int) to combine with each line's
            month/day/time when building the timestamp. When None (the
            default), ``Parser.ASSUMED_YEAR`` is used.
        """
        ints = Word(nums)

        # timestamp
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day   = ints
        hour  = Combine(ints + ":" + ints + ":" + ints)

        timestamp = month + day + hour

        def to_datetime(tokens):
            # Look the class default up at parse time so that changing
            # Parser.ASSUMED_YEAR still affects parsers built without an
            # explicit year.
            year = Parser.ASSUMED_YEAR if assumed_year is None else assumed_year
            stamp = '{} {}'.format(year, ' '.join(tokens))
            return datetime.strptime(stamp, '%Y %b %d %H:%M:%S')

        # a parse action will convert this timestamp to a datetime
        timestamp.setParseAction(to_datetime)

        # hostname
        hostname = Word(alphas + nums + "_-.")

        # appname: "name[pid]" is tried first, then a bare name
        appname = Word(alphas + "/-_.()")("appname") + (Suppress("[") + ints("pid") + Suppress("]")) | (Word(alphas + "/-_.")("appname"))
        appname.setName("appname")

        # message
        message = Regex(".*")

        # pattern build
        # (add results names to make it easier to access parsed fields)
        self._pattern = timestamp("timestamp") + hostname("hostname") + Optional(appname) + Suppress(':') + message("message")

    def parse(self, line):
        """Parse one log line and return its named fields as a dict.

        The dict has the keys ``timestamp`` (a datetime), ``hostname``,
        ``appname``, ``pid`` and ``message``; ``appname`` and ``pid`` are
        empty strings when the line does not contain them.

        Raises pyparsing's ParseException if the line does not match the
        grammar.
        """
        parsed = self._pattern.parseString(line)
        # fill in keys that might not have been found in the input string
        for key in ('appname', 'pid'):
            if key not in parsed:
                parsed[key] = ''
        return parsed.asDict()

Use runTests() to test your parser against specific test inputs:

# Take the combined pyparsing expression that Parser.__init__ builds.
pattern = Parser()._pattern

# Sample input: one log line per text line of this string.
tests = """\
Mar  7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
Mar  7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
Mar  7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
Mar  7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
Mar  7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
Mar  8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
Mar  7 21:23:22 avas dccifd[6191]: missing message body
Mar  9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
Mar  9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
Mar  9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
Mar  9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
Mar  8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
Mar  8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
Mar  8 16:05:26 avas arpwatch: listening on eth0
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
Mar  8 15:18:40 avas: last message repeated 11 times"""

# Run each sample line through the grammar and print its parsed tokens
# and named results.
pattern.runTests(tests)

Gives:

Mar  7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
[datetime.datetime(2016, 3, 7, 4, 2, 16), 'avas', 'clamd', '11165', '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND'
- pid: '11165'
- timestamp: datetime.datetime(2016, 3, 7, 4, 2, 16)


Mar  7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
[datetime.datetime(2016, 3, 7, 4, 5, 55), 'avas', 'clamd', '11240', '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND'
- pid: '11240'
- timestamp: datetime.datetime(2016, 3, 7, 4, 5, 55)


Mar  7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
[datetime.datetime(2016, 3, 7, 9, 0, 51), 'avas', 'clamd', '27173', 'SelfCheck: Database status OK.']
- appname: 'clamd'
- hostname: 'avas'
- message: 'SelfCheck: Database status OK.'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 9, 0, 51)


Mar  7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
[datetime.datetime(2016, 3, 7, 5, 59, 2), 'avas', 'clamd', '27173', 'Database correctly reloaded (20400 viruses)']
- appname: 'clamd'
- hostname: 'avas'
- message: 'Database correctly reloaded (20400 viruses)'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 5, 59, 2)


Mar  7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
[datetime.datetime(2016, 3, 7, 11, 14, 35), 'avas', 'dccd', '13284', '21 requests/sec are too many from anonymous 205.201.1.56,2246']
- appname: 'dccd'
- hostname: 'avas'
- message: '21 requests/sec are too many from anonymous 205.201.1.56,2246'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 7, 11, 14, 35)


Mar  8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
[datetime.datetime(2016, 3, 8, 0, 22, 57), 'avas', 'dccifd', '9933', 'write(MTA socket,4): Broken pipe']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'write(MTA socket,4): Broken pipe'
- pid: '9933'
- timestamp: datetime.datetime(2016, 3, 8, 0, 22, 57)


Mar  7 21:23:22 avas dccifd[6191]: missing message body
[datetime.datetime(2016, 3, 7, 21, 23, 22), 'avas', 'dccifd', '6191', 'missing message body']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'missing message body'
- pid: '6191'
- timestamp: datetime.datetime(2016, 3, 7, 21, 23, 22)


Mar  9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
[datetime.datetime(2016, 3, 9, 16, 5, 17), 'avas', 'named', '12045', 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53'
- pid: '12045'
- timestamp: datetime.datetime(2016, 3, 9, 16, 5, 17)


Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
[datetime.datetime(2016, 3, 10, 0, 38, 16), 'avas', 'dccifd', '23069', 'continue not asking DCC 17 seconds after failure']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'continue not asking DCC 17 seconds after failure'
- pid: '23069'
- timestamp: datetime.datetime(2016, 3, 10, 0, 38, 16)


Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
[datetime.datetime(2016, 3, 10, 9, 42, 11), 'avas', 'named', 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT'
- timestamp: datetime.datetime(2016, 3, 10, 9, 42, 11)


Mar  9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
[datetime.datetime(2016, 3, 9, 3, 48, 7), 'avas', 'dccd', '145', 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`']
- appname: 'dccd'
- hostname: 'avas'
- message: 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`'
- pid: '145'
- timestamp: datetime.datetime(2016, 3, 9, 3, 48, 7)


Mar  9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
[datetime.datetime(2016, 3, 9, 11, 58, 18), 'avas', 'kernel', 'i810_audio: Connection 0 with codec id 2']
- appname: 'kernel'
- hostname: 'avas'
- message: 'i810_audio: Connection 0 with codec id 2'
- timestamp: datetime.datetime(2016, 3, 9, 11, 58, 18)


Mar  9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
[datetime.datetime(2016, 3, 9, 19, 41, 13), 'avas', 'dccd', '3004', '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577']
- appname: 'dccd'
- hostname: 'avas'
- message: '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577'
- pid: '3004'
- timestamp: datetime.datetime(2016, 3, 9, 19, 41, 13)


Mar  8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
[datetime.datetime(2016, 3, 8, 9, 1, 7), 'avas', 'sshd(pam_unix)', '21839', 'session opened for user tom by (uid=35567)']
- appname: 'sshd(pam_unix)'
- hostname: 'avas'
- message: 'session opened for user tom by (uid=35567)'
- pid: '21839'
- timestamp: datetime.datetime(2016, 3, 8, 9, 1, 7)


Mar  8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
[datetime.datetime(2016, 3, 8, 3, 52, 4), 'avas', 'dccd', '13284', '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window']
- appname: 'dccd'
- hostname: 'avas'
- message: '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 8, 3, 52, 4)


Mar  8 16:05:26 avas arpwatch: listening on eth0
[datetime.datetime(2016, 3, 8, 16, 5, 26), 'avas', 'arpwatch', 'listening on eth0']
- appname: 'arpwatch'
- hostname: 'avas'
- message: 'listening on eth0'
- timestamp: datetime.datetime(2016, 3, 8, 16, 5, 26)


Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
[datetime.datetime(2016, 3, 10, 10, 0, 6), 'avas', 'named', '6986', 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 6)


Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
[datetime.datetime(2016, 3, 10, 10, 0, 10), 'avas', 'named', '6986', 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 10)

Mar  8 15:18:40 avas: last message repeated 11 times
[datetime.datetime(2016, 3, 8, 15, 18, 40), 'avas', 'last message repeated 11 times']
- hostname: 'avas'
- message: 'last message repeated 11 times'
- timestamp: datetime.datetime(2016, 3, 8, 15, 18, 40)

Or using the parse() method of the Parser class:

from pprint import pprint
# Build the grammar once and reuse it for every line; calling Parser()
# inside the loop would reconstruct the whole pyparsing grammar per line.
parser = Parser()
for t in tests.splitlines():
    pprint(parser.parse(t))
    print()

gives:

{'appname': 'clamd',
 'hostname': 'avas',
 'message': '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: '
        'Worm.Mydoom.F FOUND ',
 'pid': '11165',
 'timestamp': datetime.datetime(2016, 3, 7, 4, 2, 16)}

{'appname': 'clamd',
 'hostname': 'avas',
 'message': '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: '
        'Worm.SomeFool.Gen-1 FOUND ',
 'pid': '11240',
 'timestamp': datetime.datetime(2016, 3, 7, 4, 5, 55)}

{'appname': 'clamd',
 'hostname': 'avas',
 'message': 'SelfCheck: Database status OK.',
 'pid': '27173',
 'timestamp': datetime.datetime(2016, 3, 7, 9, 0, 51)}

{'appname': 'clamd',
 'hostname': 'avas',
 'message': 'Database correctly reloaded (20400 viruses) ',
 'pid': '27173',
 'timestamp': datetime.datetime(2016, 3, 7, 5, 59, 2)}

{'appname': 'dccd',
 'hostname': 'avas',
 'message': '21 requests/sec are too many from anonymous 205.201.1.56,2246',
 'pid': '13284',
 'timestamp': datetime.datetime(2016, 3, 7, 11, 14, 35)}

{'appname': 'dccifd',
 'hostname': 'avas',
 'message': 'write(MTA socket,4): Broken pipe',
 'pid': '9933',
 'timestamp': datetime.datetime(2016, 3, 8, 0, 22, 57)}

{'appname': 'dccifd',
 'hostname': 'avas',
 'message': 'missing message body',
 'pid': '6191',
 'timestamp': datetime.datetime(2016, 3, 7, 21, 23, 22)}

{'appname': 'named',
 'hostname': 'avas',
 'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
        '10.0.0.253#53',
 'pid': '12045',
 'timestamp': datetime.datetime(2016, 3, 9, 16, 5, 17)}

{'appname': 'dccifd',
 'hostname': 'avas',
 'message': 'continue not asking DCC 17 seconds after failure',
 'pid': '23069',
 'timestamp': datetime.datetime(2016, 3, 10, 0, 38, 16)}

{'appname': 'named',
 'hostname': 'avas',
 'message': 'client 127.0.0.1#55524: query: '
        '23.68.27.142.sa-trusted.bondedsender.org IN TXT',
 'pid': '',
 'timestamp': datetime.datetime(2016, 3, 10, 9, 42, 11)}

{'appname': 'dccd',
 'hostname': 'avas',
 'message': 'automatic dbclean; starting `dbclean -DPq -i 1189 -L '
        'info,local5.notice -L error,local5.err`',
 'pid': '145',
 'timestamp': datetime.datetime(2016, 3, 9, 3, 48, 7)}

{'appname': 'kernel',
 'hostname': 'avas',
 'message': 'i810_audio: Connection 0 with codec id 2',
 'pid': '',
 'timestamp': datetime.datetime(2016, 3, 9, 11, 58, 18)}

{'appname': 'dccd',
 'hostname': 'avas',
 'message': '"packet length 44 too small for REPORT" sent to client 1 at '
        '194.63.250.215,47577',
 'pid': '3004',
 'timestamp': datetime.datetime(2016, 3, 9, 19, 41, 13)}

{'appname': 'sshd(pam_unix)',
 'hostname': 'avas',
 'message': 'session opened for user tom by (uid=35567)',
 'pid': '21839',
 'timestamp': datetime.datetime(2016, 3, 8, 9, 1, 7)}

{'appname': 'dccd',
 'hostname': 'avas',
 'message': '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window',
 'pid': '13284',
 'timestamp': datetime.datetime(2016, 3, 8, 3, 52, 4)}

{'appname': 'arpwatch',
 'hostname': 'avas',
 'message': 'listening on eth0',
 'pid': '',
 'timestamp': datetime.datetime(2016, 3, 8, 16, 5, 26)}

{'appname': 'named',
 'hostname': 'avas',
 'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
        '192.75.26.21#53',
 'pid': '6986',
 'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 6)}

{'appname': 'named',
 'hostname': 'avas',
 'message': 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX',
 'pid': '6986',
 'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 10)}

{'appname': '',
 'hostname': 'avas',
 'message': 'last message repeated 11 times',
 'pid': '',
 'timestamp': datetime.datetime(2016, 3, 8, 15, 18, 40)}
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.