bin/crashreportScraper.py - core - Gitiles

 #!/usr/bin/env python3

 # This file is part of the LibreOffice project.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

 # Use this script to retrieve information from https://crashreport.libreoffice.org
 # about a specific version of LibreOffice
 # Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/

 import argparse
 import requests
 from bs4 import BeautifulSoup
 import sys
 import os
 from datetime import datetime
 import urllib.parse
 import re
 import git

 tableHeader = ["", "Name", "Ratio", "Count", "First report", "Last Report",
           "OS", "Stack", "Reason", "Last 4 UNO Commands"]

 HtmlHeader = \
 '<!DOCTYPE html> \
 <html lang=\"en\"> \
 <head> \
     <title>%VERSION% crashes</title> \
     <style> \
         table { \
             width: 100%; \
             border-collapse: collapse; \
         } \
         a { text-decoration:none; color: inherit; } \
         th, #td1 { \
             padding: 8px 12px; \
             border: 1px solid #ccc; \
             text-align: left; \
         } \
         th { \
             cursor: pointer; \
             background-color: #f2f2f2; \
         } \
         th:hover { \
             background-color: #ddd; \
         } \
     </style> \
 </head> \
 <body> \
 <h2>%VERSION% crashes</h2> \
 <script src="https://www.kryogenix.org/code/browser/sorttable/sorttable.js"></script>'

 def convert_str_to_date(value):
     value = value.replace('.', '')
     value = value.replace('March', 'Mar')
     value = value.replace('April', 'Apr')
     value = value.replace('June', 'Jun')
     value = value.replace('July', 'Jul')
     value = value.replace('Sept', 'Sep')
     # reset the time leaving the date
     value = ", ".join(value.split(", ")[:-1])
     return datetime.strptime(value, '%b %d, %Y')

 def parse_version_url(version, session):
     crashReports = {}
     url = "https://crashreport.libreoffice.org/stats/version/" + version + "?limit=1000&days=30"

     try:
         html_text = session.get(url, timeout=200).text
         soup = BeautifulSoup(html_text, 'html.parser')
     except requests.exceptions.Timeout:
         print("Timeout requesting " + url)
         sys.exit(1)

     table = soup.find("table", {"id": "data-table"}).tbody
     for tr in table.find_all("tr"):
         td_list = tr.find_all("td")
         crashName = td_list[0].a.text.strip()
         crashNumber = int(td_list[1].text.strip())
         firstCrashDate = convert_str_to_date(td_list[5].text.strip())
         lastCrashDate = convert_str_to_date(td_list[6].text.strip())
         crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]

     return crashReports

 def parse_reports_and_get_most_recent_report_from_last_page(signature, session):
     try:
         url = "https://crashreport.libreoffice.org/stats/signature/" + signature
         html_text = session.get(url, timeout=200).text
         soup = BeautifulSoup(html_text, 'html.parser')
     except requests.exceptions.Timeout:
         print("Timeout")
         raise

     count = 0
     ID, OS = "", ""
     try:
         os_tab = soup.find("table", {"id": "os_tab"}).tbody
     except AttributeError:
         print("os_tab not found")
         return count, ID, OS

     tr_list = os_tab.find_all("tr")
     for tr in tr_list:
         td_list = tr.find_all("td")
         count += int(td_list[1].text.strip())

     reports = soup.find("div", {"id": "reports"}).tbody
     version, currentVersion = 0, 0
     currentID, currentOS = "", ""

     tr_list = reports.find_all("tr")
     for tr in tr_list:
         td_list = tr.find_all("td")

         currentID = td_list[0].a.text.strip()
         currentVersion = int(''.join(re.findall(r"\d+", td_list[2].text)))
         currentOS = td_list[3].text.strip()

         # get most recent version
         # symbols on linux are not very informative generally
         if currentOS == "windows" and currentVersion > version:
             version = currentVersion
             ID = currentID
             OS = currentOS

     if not ID:
         ID = currentID

     if not OS:
         OS = currentOS

     return count, ID, OS

 def parse_details_and_get_info(crashId, session, gitRepo, gitBranch):
     try:
         url = "https://crashreport.libreoffice.org/stats/crash_details/" + crashID
         html_text = session.get(url, timeout=200).text
         soup = BeautifulSoup(html_text, 'html.parser')
     except requests.exceptions.Timeout:
         print("Timeout")
         raise

     details = soup.find("div", {"id": "details-tab-panel"}).tbody
     tr_list = details.find_all("tr")
     reason = tr_list[8].td.text.strip()

     stackTable = "<table>"

     count = 0
     frames = soup.find("div", {"id": "frames"}).tbody
     for tr in frames.find_all("tr"):
         td_list = tr.find_all("td")
         source = td_list[3].text.strip()
         if source and count <= 10:
             source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")

             codeFile = source.split(":")[0]
             codeNumber = source.split(":")[1]
             codeLine = "<tr><td>"
             try:
                 with open(os.path.join(gitRepo, codeFile)) as f:
                     lines = f.readlines()
                     for index, line in enumerate(lines):
                         if index + 1 == int(codeNumber):
                             urlLink = "https://git.libreoffice.org/core/+/" + \
                                 gitBranch + "/" + codeFile + "#" + str(codeNumber)
                             codeLine += str(count) + ": <a target=\"_blank\" href=\"" + urlLink + "\">" + line.strip().replace("\"", "'") + "</a>"
                             count += 1
             except FileNotFoundError:
                 continue
             codeLine += "</tr></td>"
             stackTable += codeLine

     stackTable += "</table>"

     metadata = soup.find("div", {"id": "metadata-tab-panel"}).tbody
     tr_list = metadata.find_all("tr")
     unoCommands = ""
     for tr in tr_list:
         if tr.th.text.strip() == "Last-4-Uno-Commands":
             unoCommands = tr.td.text.strip()

     return reason, stackTable, unoCommands

 if __name__ == '__main__':

     parser = argparse.ArgumentParser()

     parser.add_argument('--version', action='store', dest="version", required=True)
     parser.add_argument('--repository', action="store", dest="repository", required=True)

     args = parser.parse_args()

     gitBranch = git.Repo(args.repository).active_branch.name

     session = requests.Session()
     session.headers.update({'Referer': 'https://crashreport.libreoffice.org'})

     crashes = parse_version_url(args.version, session)

     print(str(len(crashes)) + " crash reports in version " + args.version)

     crashesInFile = []
     fileName = "crashes_" + args.version.replace(".", "_") + ".html"
     print("Using " + fileName)

     with open(fileName, "w") as f:
         f.write(HtmlHeader.replace("%VERSION%", args.version))
         f.write("<table class=\"sortable\">")
         f.write("<thead>")
         f.write("<tr>")
         for name in tableHeader:
             f.write("<th>" + name + "</th>")
         f.write("</tr>")
         f.write("</thead>")
         f.flush()

         f.write("<tbody>")
         count = 0
         for k, lDate in crashes.items():
             if k not in crashesInFile:
                 print("Parsing " + k)
                 f.write("<tr>")
                 try:
                     crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                             urllib.parse.quote(k), session)
                     if crashCount == 0:
                         continue

                     crashReason, codeStack, unoCommands = parse_details_and_get_info(
                             crashID, session, args.repository, gitBranch)
                     ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
                     count += 1
                     f.write("<td id=\"td1\">" + str(count) + "</td>")
                     f.write("<td id=\"td1\"><b><a target=\"_blank\" href=\"https://crashreport.libreoffice.org/stats/crash_details/"
                         + crashID + "\">" + k + "</a></b></td>")
                     f.write("<td id=\"td1\">" + str(ratio) + "</td>")
                     f.write("<td id=\"td1\">" + str(crashCount) + "</td>")
                     f.write("<td id=\"td1\">" + lDate[1].strftime('%Y/%m/%d') + "</td>")
                     f.write("<td id=\"td1\">" + lDate[2].strftime('%Y/%m/%d') + "</td>")
                     f.write("<td id=\"td1\">" + crashOS + "</td>")
                     f.write("<td id=\"td1\">" + codeStack + "</td>")
                     f.write("<td id=\"td1\">" + crashReason + "</td>")
                     f.write("<td id=\"td1\">" + unoCommands + "</td>")
                 except (requests.exceptions.Timeout):
                     continue
                 f.write("</tr>")
                 f.flush()

         f.write("</tbody>")
         f.write("</table>")
         f.write("</body>")
         f.write("</html>")
	#!/usr/bin/env python3

	# This file is part of the LibreOffice project.
	#
	# This Source Code Form is subject to the terms of the Mozilla Public
	# License, v. 2.0. If a copy of the MPL was not distributed with this
	# file, You can obtain one at http://mozilla.org/MPL/2.0/.

	# Use this script to retrieve information from https://crashreport.libreoffice.org
	# about a specific version of LibreOffice
	# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/

	import argparse
	import requests
	from bs4 import BeautifulSoup
	import sys
	import os
	from datetime import datetime
	import urllib.parse
	import re
	import git

	tableHeader = ["", "Name", "Ratio", "Count", "First report", "Last Report",
	"OS", "Stack", "Reason", "Last 4 UNO Commands"]

	HtmlHeader = \
	'<!DOCTYPE html> \
	<html lang=\"en\"> \
	<head> \
	<title>%VERSION% crashes</title> \
	<style> \
	table { \
	width: 100%; \
	border-collapse: collapse; \
	} \
	a { text-decoration:none; color: inherit; } \
	th, #td1 { \
	padding: 8px 12px; \
	border: 1px solid #ccc; \
	text-align: left; \
	} \
	th { \
	cursor: pointer; \
	background-color: #f2f2f2; \
	} \
	th:hover { \
	background-color: #ddd; \
	} \
	</style> \
	</head> \
	<body> \
	<h2>%VERSION% crashes</h2> \
	<script src="https://www.kryogenix.org/code/browser/sorttable/sorttable.js"></script>'

	def convert_str_to_date(value):
	value = value.replace('.', '')
	value = value.replace('March', 'Mar')
	value = value.replace('April', 'Apr')
	value = value.replace('June', 'Jun')
	value = value.replace('July', 'Jul')
	value = value.replace('Sept', 'Sep')
	# reset the time leaving the date
	value = ", ".join(value.split(", ")[:-1])
	return datetime.strptime(value, '%b %d, %Y')

	def parse_version_url(version, session):
	crashReports = {}
	url = "https://crashreport.libreoffice.org/stats/version/" + version + "?limit=1000&days=30"

	try:
	html_text = session.get(url, timeout=200).text
	soup = BeautifulSoup(html_text, 'html.parser')
	except requests.exceptions.Timeout:
	print("Timeout requesting " + url)
	sys.exit(1)

	table = soup.find("table", {"id": "data-table"}).tbody
	for tr in table.find_all("tr"):
	td_list = tr.find_all("td")
	crashName = td_list[0].a.text.strip()
	crashNumber = int(td_list[1].text.strip())
	firstCrashDate = convert_str_to_date(td_list[5].text.strip())
	lastCrashDate = convert_str_to_date(td_list[6].text.strip())
	crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]

	return crashReports

	def parse_reports_and_get_most_recent_report_from_last_page(signature, session):
	try:
	url = "https://crashreport.libreoffice.org/stats/signature/" + signature
	html_text = session.get(url, timeout=200).text
	soup = BeautifulSoup(html_text, 'html.parser')
	except requests.exceptions.Timeout:
	print("Timeout")
	raise

	count = 0
	ID, OS = "", ""
	try:
	os_tab = soup.find("table", {"id": "os_tab"}).tbody
	except AttributeError:
	print("os_tab not found")
	return count, ID, OS

	tr_list = os_tab.find_all("tr")
	for tr in tr_list:
	td_list = tr.find_all("td")
	count += int(td_list[1].text.strip())

	reports = soup.find("div", {"id": "reports"}).tbody
	version, currentVersion = 0, 0
	currentID, currentOS = "", ""

	tr_list = reports.find_all("tr")
	for tr in tr_list:
	td_list = tr.find_all("td")

	currentID = td_list[0].a.text.strip()
	currentVersion = int(''.join(re.findall(r"\d+", td_list[2].text)))
	currentOS = td_list[3].text.strip()

	# get most recent version
	# symbols on linux are not very informative generally
	if currentOS == "windows" and currentVersion > version:
	version = currentVersion
	ID = currentID
	OS = currentOS

	if not ID:
	ID = currentID

	if not OS:
	OS = currentOS

	return count, ID, OS

	def parse_details_and_get_info(crashId, session, gitRepo, gitBranch):
	try:
	url = "https://crashreport.libreoffice.org/stats/crash_details/" + crashID
	html_text = session.get(url, timeout=200).text
	soup = BeautifulSoup(html_text, 'html.parser')
	except requests.exceptions.Timeout:
	print("Timeout")
	raise

	details = soup.find("div", {"id": "details-tab-panel"}).tbody
	tr_list = details.find_all("tr")
	reason = tr_list[8].td.text.strip()

	stackTable = "<table>"

	count = 0
	frames = soup.find("div", {"id": "frames"}).tbody
	for tr in frames.find_all("tr"):
	td_list = tr.find_all("td")
	source = td_list[3].text.strip()
	if source and count <= 10:
	source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")

	codeFile = source.split(":")[0]
	codeNumber = source.split(":")[1]
	codeLine = "<tr><td>"
	try:
	with open(os.path.join(gitRepo, codeFile)) as f:
	lines = f.readlines()
	for index, line in enumerate(lines):
	if index + 1 == int(codeNumber):
	urlLink = "https://git.libreoffice.org/core/+/" + \
	gitBranch + "/" + codeFile + "#" + str(codeNumber)
	codeLine += str(count) + ": <a target=\"_blank\" href=\"" + urlLink + "\">" + line.strip().replace("\"", "'") + "</a>"
	count += 1
	except FileNotFoundError:
	continue
	codeLine += "</tr></td>"
	stackTable += codeLine

	stackTable += "</table>"

	metadata = soup.find("div", {"id": "metadata-tab-panel"}).tbody
	tr_list = metadata.find_all("tr")
	unoCommands = ""
	for tr in tr_list:
	if tr.th.text.strip() == "Last-4-Uno-Commands":
	unoCommands = tr.td.text.strip()

	return reason, stackTable, unoCommands

	if __name__ == '__main__':

	parser = argparse.ArgumentParser()

	parser.add_argument('--version', action='store', dest="version", required=True)
	parser.add_argument('--repository', action="store", dest="repository", required=True)

	args = parser.parse_args()

	gitBranch = git.Repo(args.repository).active_branch.name

	session = requests.Session()
	session.headers.update({'Referer': 'https://crashreport.libreoffice.org'})

	crashes = parse_version_url(args.version, session)

	print(str(len(crashes)) + " crash reports in version " + args.version)

	crashesInFile = []
	fileName = "crashes_" + args.version.replace(".", "_") + ".html"
	print("Using " + fileName)

	with open(fileName, "w") as f:
	f.write(HtmlHeader.replace("%VERSION%", args.version))
	f.write("<table class=\"sortable\">")
	f.write("<thead>")
	f.write("<tr>")
	for name in tableHeader:
	f.write("<th>" + name + "</th>")
	f.write("</tr>")
	f.write("</thead>")
	f.flush()

	f.write("<tbody>")
	count = 0
	for k, lDate in crashes.items():
	if k not in crashesInFile:
	print("Parsing " + k)
	f.write("<tr>")
	try:
	crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
	urllib.parse.quote(k), session)
	if crashCount == 0:
	continue

	crashReason, codeStack, unoCommands = parse_details_and_get_info(
	crashID, session, args.repository, gitBranch)
	ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
	count += 1
	f.write("<td id=\"td1\">" + str(count) + "</td>")
	f.write("<td id=\"td1\"><b><a target=\"_blank\" href=\"https://crashreport.libreoffice.org/stats/crash_details/"
	+ crashID + "\">" + k + "</a></b></td>")
	f.write("<td id=\"td1\">" + str(ratio) + "</td>")
	f.write("<td id=\"td1\">" + str(crashCount) + "</td>")
	f.write("<td id=\"td1\">" + lDate[1].strftime('%Y/%m/%d') + "</td>")
	f.write("<td id=\"td1\">" + lDate[2].strftime('%Y/%m/%d') + "</td>")
	f.write("<td id=\"td1\">" + crashOS + "</td>")
	f.write("<td id=\"td1\">" + codeStack + "</td>")
	f.write("<td id=\"td1\">" + crashReason + "</td>")
	f.write("<td id=\"td1\">" + unoCommands + "</td>")
	except (requests.exceptions.Timeout):
	continue
	f.write("</tr>")
	f.flush()

	f.write("</tbody>")
	f.write("</table>")
	f.write("</body>")
	f.write("</html>")