| #!/usr/bin/env python3 |
| |
| # This file is part of the LibreOffice project. |
| # |
| # This Source Code Form is subject to the terms of the Mozilla Public |
| # License, v. 2.0. If a copy of the MPL was not distributed with this |
| # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| # Use this script to retrieve information from https://crashreport.libreoffice.org |
| # about a specific version of LibreOffice |
| # Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/ |
| |
import argparse
import html
import os
import re
import sys
import urllib.parse
from datetime import datetime

import git
import requests
from bs4 import BeautifulSoup
| |
# Column titles of the generated report table, in display order.
# The first column is untitled; the script fills it with a running row number.
tableHeader = [
    "",
    "Name",
    "Ratio",
    "Count",
    "First report",
    "Last Report",
    "OS",
    "Stack",
    "Reason",
    "Last 4 UNO Commands",
]

# Page prologue written at the top of the generated HTML file.
# It is one single-line string (no newlines) made of adjacent literals.
# The two %VERSION% placeholders are substituted by the script before writing.
# The stylesheet selects table cells through the "td1" id.
HtmlHeader = (
    '<!DOCTYPE html> '
    '<html lang="en"> '
    '<head> '
    '<title>%VERSION% crashes</title> '
    '<style> '
    'table { '
    'width: 100%; '
    'border-collapse: collapse; '
    '} '
    'a { text-decoration:none; color: inherit; } '
    'th, #td1 { '
    'padding: 8px 12px; '
    'border: 1px solid #ccc; '
    'text-align: left; '
    '} '
    'th { '
    'cursor: pointer; '
    'background-color: #f2f2f2; '
    '} '
    'th:hover { '
    'background-color: #ddd; '
    '} '
    '</style> '
    '</head> '
    '<body> '
    '<h2>%VERSION% crashes</h2> '
    '<script src="https://www.kryogenix.org/code/browser/sorttable/sorttable.js"></script>'
)
| |
def convert_str_to_date(value):
    """Turn a textual timestamp into a ``datetime`` holding only the date.

    All dots are removed, the month spellings listed below are shortened to
    their three-letter form, and the final ``", "``-separated field (the
    time of day) is discarded before parsing with ``'%b %d, %Y'``.

    Example: ``"Sept. 3, 2021, 4:15 p.m."`` -> ``datetime(2021, 9, 3)``.

    Raises ``ValueError`` (from ``strptime``) when what remains does not
    match the ``'%b %d, %Y'`` format.
    """
    text = value.replace('.', '')

    # Month spellings that %b cannot parse, paired with their abbreviation.
    abbreviations = (
        ('March', 'Mar'),
        ('April', 'Apr'),
        ('June', 'Jun'),
        ('July', 'Jul'),
        ('Sept', 'Sep'),
    )
    for spelled_out, short in abbreviations:
        text = text.replace(spelled_out, short)

    # reset the time leaving the date: drop the last comma-separated field
    fields = text.split(", ")
    date_part = ", ".join(fields[:-1])
    return datetime.strptime(date_part, '%b %d, %Y')
| |
def parse_version_url(version, session):
    """Collect the crash signatures listed on the statistics page of *version*.

    The page is requested with ``limit=1000`` and ``days=30``. For every row
    of the table with id ``data-table`` the signature name (link text of the
    first cell), the number in the second cell and the dates in the sixth and
    seventh cells are read.

    Parameters:
        version: version string appended to the statistics URL.
        session: object providing ``get(url, timeout=...)`` whose result has
            a ``.text`` attribute (a ``requests.Session`` in this script).

    Returns:
        dict mapping signature name to
        ``[crash number, first crash date, last crash date]``.

    A request timeout is reported on stdout and terminates the process with
    exit status 1. Missing page elements are not handled here.
    """
    url = "https://crashreport.libreoffice.org/stats/version/" + version + "?limit=1000&days=30"

    try:
        response = session.get(url, timeout=200)
        soup = BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    body = soup.find("table", {"id": "data-table"}).tbody

    crashReports = {}
    for row in body.find_all("tr"):
        cells = row.find_all("td")
        signature = cells[0].a.text.strip()
        crashReports[signature] = [
            int(cells[1].text.strip()),
            convert_str_to_date(cells[5].text.strip()),
            convert_str_to_date(cells[6].text.strip()),
        ]

    return crashReports
| |
def parse_reports_and_get_most_recent_report_from_last_page(signature, session):
    """Fetch the page of one crash signature and pick a representative report.

    Parameters:
        signature: already URL-quoted signature appended to the signature URL.
        session: object providing ``get(url, timeout=...)`` whose result has
            a ``.text`` attribute (a ``requests.Session`` in this script).

    Returns:
        ``(count, ID, OS)`` where *count* is the sum of the second column of
        the ``os_tab`` table, and *ID*/*OS* identify the chosen report: the
        "windows" row with the highest version if there is one, otherwise
        the last row of the reports table. When the ``os_tab`` table is
        missing, ``(0, "", "")`` is returned.

    Raises:
        requests.exceptions.Timeout: re-raised after printing "Timeout".
    """
    url = "https://crashreport.libreoffice.org/stats/signature/" + signature
    try:
        html_text = session.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    count = 0
    ID, OS = "", ""
    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        # soup.find() returned None: the page has no per-OS table.
        print("os_tab not found")
        return count, ID, OS

    for tr in os_tab.find_all("tr"):
        count += int(tr.find_all("td")[1].text.strip())

    reports = soup.find("div", {"id": "reports"}).tbody

    # Versions are compared as tuples of their numeric components.
    # Concatenating the digits into a single int ranks "7.2.10.1" (72101)
    # above "7.3.0.1" (7301); tuple comparison orders them correctly.
    # The empty tuple sorts below every real version.
    newestVersion = ()
    currentID, currentOS = "", ""

    for tr in reports.find_all("tr"):
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = tuple(int(part) for part in re.findall(r"\d+", td_list[2].text))
        currentOS = td_list[3].text.strip()

        # get most recent version
        # symbols on linux are not very informative generally
        if currentOS == "windows" and currentVersion > newestVersion:
            newestVersion = currentVersion
            ID = currentID
            OS = currentOS

    # No windows report found: fall back to the last row that was read.
    if not ID:
        ID = currentID

    if not OS:
        OS = currentOS

    return count, ID, OS
| |
def parse_details_and_get_info(crashId, session, gitRepo, gitBranch):
    """Fetch one crash-details page and extract reason, stack and UNO commands.

    Parameters:
        crashId: identifier appended to the crash_details URL.
        session: object providing ``get(url, timeout=...)`` whose result has
            a ``.text`` attribute (a ``requests.Session`` in this script).
        gitRepo: path of a local source checkout; frame source paths are
            opened relative to it.
        gitBranch: branch name inserted into the git.libreoffice.org links.

    Returns:
        ``(reason, stackTable, unoCommands)``:
        *reason* is the text of the first cell of the ninth row of the
        details table; *stackTable* is an HTML ``<table>`` string with one
        row per frame whose source file exists in *gitRepo*, each linking
        the source line to git.libreoffice.org; *unoCommands* is the value
        of the "Last-4-Uno-Commands" metadata row, or "" if absent.

    Raises:
        requests.exceptions.Timeout: re-raised after printing "Timeout".
    """
    url = "https://crashreport.libreoffice.org/stats/crash_details/" + crashId
    try:
        html_text = session.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    details = soup.find("div", {"id": "details-tab-panel"}).tbody
    tr_list = details.find_all("tr")
    # NOTE(review): fixed row index; presumably the "reason" row of the
    # details table - confirm against the current page layout.
    reason = tr_list[8].td.text.strip()

    stackTable = "<table>"

    # Number of source lines emitted so far; frames are skipped once it
    # exceeds 10, so at most eleven lines (0 through 10) are shown.
    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        if not source or count > 10:
            continue

        # Normalise path separators and drop the build-machine prefix so the
        # path becomes relative to the repository root.
        source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")

        # Expected shape is "path:line"; skip anything else rather than
        # failing with IndexError/ValueError.
        parts = source.split(":")
        if len(parts) < 2 or not parts[1].strip().isdecimal():
            continue
        codeFile, codeNumber = parts[0], parts[1]
        lineNumber = int(codeNumber)

        try:
            with open(os.path.join(gitRepo, codeFile)) as f:
                lines = f.readlines()
        except FileNotFoundError:
            # Frame does not belong to a file of this checkout.
            continue

        codeLine = "<tr><td>"
        if 1 <= lineNumber <= len(lines):
            urlLink = "https://git.libreoffice.org/core/+/" + \
                gitBranch + "/" + codeFile + "#" + str(codeNumber)
            # Escape <, > and & so that source text such as "vector<int>"
            # is displayed instead of being parsed as markup.
            lineText = html.escape(lines[lineNumber - 1].strip().replace("\"", "'"), quote=False)
            codeLine += str(count) + ": <a target=\"_blank\" href=\"" + urlLink + "\">" + lineText + "</a>"
            count += 1
        # Close the cell before the row (the tags were previously swapped).
        codeLine += "</td></tr>"
        stackTable += codeLine

    stackTable += "</table>"

    metadata = soup.find("div", {"id": "metadata-tab-panel"}).tbody
    unoCommands = ""
    for tr in metadata.find_all("tr"):
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stackTable, unoCommands
| |
# Script entry point: scrape every crash signature of the requested version
# and write them as a sortable HTML table to crashes_<version>.html in the
# current directory.
if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    gitBranch = git.Repo(args.repository).active_branch.name

    session = requests.Session()
    session.headers.update({'Referer': 'https://crashreport.libreoffice.org'})

    crashes = parse_version_url(args.version, session)

    print(str(len(crashes)) + " crash reports in version " + args.version)

    fileName = "crashes_" + args.version.replace(".", "_") + ".html"
    print("Using " + fileName)

    with open(fileName, "w") as f:
        f.write(HtmlHeader.replace("%VERSION%", args.version))
        f.write("<table class=\"sortable\">")
        f.write("<thead>")
        f.write("<tr>")
        for name in tableHeader:
            f.write("<th>" + name + "</th>")
        f.write("</tr>")
        f.write("</thead>")
        f.flush()

        f.write("<tbody>")
        count = 0
        for k, lDate in crashes.items():
            print("Parsing " + k)
            try:
                crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                    urllib.parse.quote(k), session)
                if crashCount == 0:
                    continue

                crashReason, codeStack, unoCommands = parse_details_and_get_info(
                    crashID, session, args.repository, gitBranch)
            except requests.exceptions.Timeout:
                # Best effort: skip signatures whose pages time out.
                continue

            # Crashes per day over the inclusive first..last report span.
            ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
            count += 1

            # Scraped text is escaped before being embedded: crash signatures
            # commonly contain '<', '>' and '&'. codeStack is already HTML.
            nameLink = "<b><a target=\"_blank\" href=\"https://crashreport.libreoffice.org/stats/crash_details/" \
                + html.escape(crashID) + "\">" + html.escape(k, quote=False) + "</a></b>"
            cells = [
                str(count),
                nameLink,
                str(ratio),
                str(crashCount),
                lDate[1].strftime('%Y/%m/%d'),
                lDate[2].strftime('%Y/%m/%d'),
                html.escape(crashOS, quote=False),
                codeStack,
                html.escape(crashReason, quote=False),
                html.escape(unoCommands, quote=False),
            ]

            # The row is written only once all its data is available, so a
            # skipped signature no longer leaves an unclosed <tr> behind.
            # The "td1" id is kept because the stylesheet in HtmlHeader
            # selects cells through it.
            f.write("<tr>")
            for cell in cells:
                f.write("<td id=\"td1\">" + cell + "</td>")
            f.write("</tr>")
            f.flush()

        f.write("</tbody>")
        f.write("</table>")
        f.write("</body>")
        f.write("</html>")