# apachelog.py (1.7 KB)
  1. from pathlib import Path
  2. import re
  3. from datetime import datetime
  4. import xml.etree.ElementTree as ET
  5. import csv
  6. base_dir = str(Path(__file__).parent)
  7. def convert_log(line):
  8. # 10.6.173.69 - - [01/Dec/2020:10:03:16 +0100] "GET /GAPS_BMW/index.php5?&rc=MISView&rm=getReport&ras[]=49331 HTTP/1.1" 200 64512
  9. match = list(re.findall(r"([\d\.]+) - - \[(.*)\] \"GET /.*&ras\[\]=(\d+) .* 200 (\d+)", line)[0])
  10. match[1] = datetime.strptime(match[1], "%d/%b/%Y:%H:%M:%S %z").isoformat(timespec="seconds")
  11. return match
  12. def parse_log(filename):
  13. with open(filename, "r") as frh:
  14. logs = [convert_log(line) for line in frh.readlines() if line.find("rc=MISView&rm=getReport") > -1]
  15. return logs
  16. def parse_portal_xml(filename):
  17. portal = ET.parse(filename)
  18. folders = portal.getroot().find("Publishes").findall("Publish")
  19. res = {}
  20. for p in folders:
  21. user = p.get("User")
  22. name = p.get("Name")
  23. for i in p.find("Images").findall("Image"):
  24. res[i.get("ID")] = [user, name, i.find("Report").text]
  25. return res
  26. def combine_logs_reports(logs, reports):
  27. return [line + reports.get(line[2], [""] * 3) for line in logs]
  28. def main():
  29. reports = parse_portal_xml(base_dir + "/config/GAPS_BMW_NEU.xml")
  30. logs = parse_log(base_dir + "/apache/access.log")
  31. header = ["ip", "timestamp", "id", "bytes", "user", "section", "report"]
  32. logs = combine_logs_reports(logs, reports)
  33. with open(base_dir + "/export/accesslog.csv", "w") as fwh:
  34. csv_writer = csv.writer(fwh, delimiter=";")
  35. csv_writer.writerow(header)
  36. csv_writer.writerows(logs)
  37. print(len(logs))
  38. print(logs[0])
  39. if __name__ == "__main__":
  40. main()