"""Join report requests from an Apache access log with portal XML metadata.

The result is written as a semicolon-separated CSV file below ``base_dir``.
"""
import re
from datetime import datetime
import xml.etree.ElementTree as ET
import csv

base_dir = '/home/robert/projekte/python/logviewer'

# Substring that marks an access-log line as a report request.
REPORT_MARKER = 'rc=MISView&rm=getReport'

# Captures: client IP, bracketed timestamp, report id (``ras[]``), byte count.
# Only successful (status 200) GET requests match.
LOG_PATTERN = re.compile(
    r"([\d\.]+) - - \[(.*)\] \"GET /.*&ras\[\]=(\d+) .* 200 (\d+)"
)

# strptime format of the bracketed access-log timestamp.
LOG_TIME_FORMAT = "%d/%b/%Y:%H:%M:%S %z"


def _match_log(line):
    """Return ``[ip, iso_timestamp, id, bytes]`` for *line*, or ``None``.

    ``None`` means the line does not match ``LOG_PATTERN``.  A line that
    matches but carries an unparsable timestamp still raises ``ValueError``.
    """
    found = LOG_PATTERN.search(line)
    if found is None:
        return None
    fields = list(found.groups())
    fields[1] = datetime.strptime(fields[1], LOG_TIME_FORMAT).isoformat(
        timespec='seconds'
    )
    return fields


def convert_log(line):
    """Convert one access-log line into ``[ip, timestamp, id, bytes]``.

    Example input::

        10.6.173.69 - - [01/Dec/2020:10:03:16 +0100] "GET /GAPS_BMW/index.php5?&rc=MISView&rm=getReport&ras[]=49331 HTTP/1.1" 200 64512

    All four fields are strings; the timestamp is converted to ISO 8601
    with second precision.

    Raises:
        IndexError: if *line* does not match the expected format.  The
            exception type is kept from the earlier ``findall(...)[0]``
            implementation so existing callers are not affected.
        ValueError: if the timestamp cannot be parsed.
    """
    fields = _match_log(line)
    if fields is None:
        raise IndexError(
            'line does not match the report access-log format: %r' % (line,)
        )
    return fields


def parse_log(filename):
    """Return the converted report requests found in the log *filename*.

    Only lines containing ``REPORT_MARKER`` are considered.  Lines that
    contain the marker but do not match ``LOG_PATTERN`` (for example a
    non-200 status or a missing ``ras[]`` parameter) are skipped instead
    of aborting the whole run.
    """
    logs = []
    # errors='replace': one undecodable byte in a request line must not
    # abort the parse of the entire log.
    with open(filename, 'r', errors='replace') as frh:
        for line in frh:
            if REPORT_MARKER not in line:
                continue
            fields = _match_log(line)
            if fields is not None:
                logs.append(fields)
    return logs


def parse_portal_xml(filename):
    """Map each image ID in the portal XML to ``[user, name, report]``.

    Reads ``Publishes/Publish/Images/Image`` below the root element.
    ``user`` and ``name`` are the ``User`` and ``Name`` attributes of the
    enclosing ``Publish``; ``report`` is the text of the image's ``Report``
    child.  A missing ``Publishes``, ``Images`` or ``Report`` element is
    tolerated: it yields no entries, or ``None`` as the report text.
    """
    res = {}
    publishes = ET.parse(filename).getroot().find('Publishes')
    if publishes is None:
        return res
    for publish in publishes.findall('Publish'):
        images = publish.find('Images')
        if images is None:
            continue
        user = publish.get('User')
        name = publish.get('Name')
        for image in images.findall('Image'):
            report = image.find('Report')
            report_text = report.text if report is not None else None
            res[image.get('ID')] = [user, name, report_text]
    return res


def combine_logs_reports(logs, reports):
    """Append the report metadata to every log row.

    The lookup key is the row's third field (the report id).  Rows whose
    id is not in *reports* get three empty strings instead.
    """
    return [line + reports.get(line[2], [''] * 3) for line in logs]


def main():
    """Read the portal XML and the access log, then write the CSV export."""
    reports = parse_portal_xml(base_dir + '/config/GAPS_BMW_NEU.xml')
    logs = parse_log(base_dir + '/apache/access.log')
    header = ['ip', 'timestamp', 'id', 'bytes', 'user', 'section', 'report']
    logs = combine_logs_reports(logs, reports)
    # newline='' is required by the csv module; without it, platforms that
    # translate newlines write an extra blank line after every row.
    with open(base_dir + '/export/accesslog.csv', 'w', newline='') as fwh:
        csv_writer = csv.writer(fwh, delimiter=';')
        csv_writer.writerow(header)
        csv_writer.writerows(logs)
    print(len(logs))
    # An empty result is valid; do not index into an empty list.
    if logs:
        print(logs[0])


if __name__ == '__main__':
    main()