"""Parsers for archived status pages.

* ``process_github_page`` extracts incidents from a GitHub status history page.
* ``extract_individual_reports`` extracts the user outage reports that are
  embedded as JavaScript data in an outage-tracker page.
"""
import json
import re
from datetime import datetime

import numpy as np
import pandas as pd

# bs4 and dateutil are imported inside the functions that need them, so the
# pure-Python helpers in this file stay importable without the HTML stack.

# Status-page CSS class -> incident severity, checked in this order.
# NOTE(review): 'impact-none' is reported as 'minor', exactly as before --
# confirm that this mapping is intended and not a copy/paste slip.
IMPACT_CLASS_TO_TYPE = {
    'impact-minor': 'minor',
    'impact-none': 'minor',
    'impact-major': 'major',
    'impact-critical': 'critical',
    'impact-maintenance': 'maintenance',
}

REPORT_COLUMNS = ['id', 'country', 'reason']


# Parser for github status till september 2, 2020
# https://github.blog/2020-09-02-github-availability-report-august-2020/

def get_incident_year(incident_title):
    """Return the year (int) of the header that precedes an incident title.

    Walks backwards over the siblings that precede the title's parent and
    returns the integer text of the first element carrying
    ``data-var="year"`` found inside one of those siblings.

    Raises:
        ValueError: if no preceding sibling contains such an element, or if
            its text is not an integer.
    """
    from bs4.element import Tag

    for sibling in incident_title.parent.previous_siblings:
        # Text nodes between tags are strings; only real tags can be searched.
        if not isinstance(sibling, Tag):
            continue
        year_node = sibling.find(attrs={'data-var': 'year'})
        if year_node is not None:
            return int(year_node.get_text())

    raise ValueError('No year header found before incident: ' + str(incident_title))


def get_incident_type(incident_title):
    """Map the impact CSS class of an incident title to a severity label.

    Args:
        incident_title: element whose ``attrs['class']`` holds its CSS classes.

    Returns:
        One of 'minor', 'major', 'critical' or 'maintenance'.

    Raises:
        ValueError: if none of the known impact classes is present
            (including when the element has no ``class`` attribute at all).
    """
    classes = incident_title.attrs.get('class', ())
    for css_class, impact in IMPACT_CLASS_TO_TYPE.items():
        if css_class in classes:
            return impact
    raise ValueError('Unknown impact class: ' + str(incident_title))


def get_maintenance_incident_details(incident_update):
    """Placeholder for maintenance incidents; always raises NotImplementedError."""
    raise NotImplementedError('Unimplemented. Implement when we encounter one')


def _parse_update_timestamp(update_div, year):
    """Return the POSIX timestamp written in the first ``small`` element of
    ``update_div``, placed in ``year``."""
    from dateutil.parser import parse as parse_date

    time_text = update_div.select('small')[0].get_text().strip()
    # Seed the parser with the incident's year instead of today's date, so the
    # result does not depend on when the notebook is run (e.g. "Feb 29" parsed
    # during a non-leap year).
    parsed = parse_date(time_text, default=datetime(year, 1, 1))
    # The year from the page header always wins, as before.
    return parsed.replace(year=year).timestamp()


def get_normal_incident_details(incident_updates, year):
    """Extract start and end timestamps of a non-maintenance incident.

    Args:
        incident_updates: the incident's updates container element.
        year: year to apply to the timestamps shown in the updates.

    Returns:
        dict with ``event_start_time`` and ``event_end_time`` as ``np.int64``
        POSIX timestamps. When there is not exactly one ``.investigating``
        update, the start time is -1 and a ``status`` key holding the
        container's full text is added so the entry can be inspected by hand.

    Raises:
        Whatever the lookup/parsing raises (e.g. IndexError when there is no
        ``.resolved`` update); the offending markup is printed first.
    """
    try:
        investigating_divs = incident_updates.select('.investigating')
        has_single_start = len(investigating_divs) == 1

        if has_single_start:
            start_time = _parse_update_timestamp(investigating_divs[0], year)
        else:
            start_time = -1

        resolved_div = incident_updates.select('.resolved')[0]
        end_time = _parse_update_timestamp(resolved_div, year)

        details = {
            'event_start_time': np.int64(start_time),
            'event_end_time': np.int64(end_time),
        }
        if not has_single_start:
            details['status'] = incident_updates.get_text()
        return details

    except Exception:
        # Debugging aid: show the markup that could not be handled.
        print(incident_updates)
        raise


def process_github_page(file_obj, metadata):
    """Parse a GitHub status history page into a list of incident dicts.

    Args:
        file_obj: readable object whose ``read()`` returns the page HTML.
        metadata: mapping describing the page; only an optional ``'date'``
            entry is used, for the error message.

    Returns:
        list of dicts with ``event_start_time``, ``event_end_time`` and
        ``status`` (the incident type, unless the details already carry a
        ``status`` describing a parsing problem).

    Raises:
        ValueError: if the number of incident titles differs from the number
            of ``.updates-container`` elements (they are paired by position).
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(file_obj.read(), 'lxml')
    impact_classes = tuple(IMPACT_CLASS_TO_TYPE)

    def is_incident_title(class_value):
        return (
            bool(class_value)
            and 'incident-title' in class_value
            and any(impact_class in class_value for impact_class in impact_classes)
        )

    titles = soup.find_all(attrs={'class': is_incident_title})
    updates = soup.select('.updates-container')
    if len(titles) != len(updates):
        # metadata may legitimately lack 'date' (the sample run passes {}).
        page_date = (metadata or {}).get('date', 'unknown date')
        raise ValueError(
            'Number of titles does not match number of updates containers: ' + str(page_date)
        )

    extracted_incident_info = []
    for title, update in zip(titles, updates):
        incident_type = get_incident_type(title)
        incident_year = get_incident_year(title)

        if incident_type == 'maintenance':
            incident_details = get_maintenance_incident_details(update)
        else:
            incident_details = get_normal_incident_details(update, incident_year)

        # Using incident field for error if required
        if 'status' not in incident_details:
            incident_details['status'] = incident_type

        extracted_incident_info.append(incident_details)

    return extracted_incident_info


def _report_country(report):
    """Return the report's country code, accepting both key spellings."""
    for key in ('country_code', 'countryCode'):
        if key in report:
            return report[key]
    raise ValueError('Country code not found')


def _report_reason(report, reason_by_id):
    """Return the reason name for a report, or 'Unknown' if it names no subject."""
    if 'oSubjectId' in report:
        return reason_by_id[report['oSubjectId']]
    for key, value in report.items():
        if 'subject' in key or 'Subject' in key:
            # Exactly one reason per report keeps the output columns aligned.
            return reason_by_id[value]
    return 'Unknown'


def extract_individual_reports(file_obj):
    """Extract the recent user reports embedded in an outage-tracker page.

    Args:
        file_obj: readable object whose ``read()`` returns the page as ``str``
            or UTF-8 ``bytes``.

    Returns:
        pandas.DataFrame with columns ``id``, ``country`` and ``reason``, one
        row per report (empty when the page has no ``recentReports`` data).

    Raises:
        ValueError: if ``read()`` returns None, the ``outage_subjects`` table
            is missing, or a report carries no country code.
        KeyError: if a report has no ``_id`` or refers to an unknown subject.
    """
    raw = file_obj.read()
    if raw is None:
        raise ValueError(f'ERROR: HTML IS NONE \n file_obj {file_obj} ')
    if isinstance(raw, (bytes, bytearray)):
        # str(bytes) would yield the "b'...'" repr, whose escapes corrupt the JSON.
        html = bytes(raw).decode('utf-8', errors='replace')
    else:
        html = str(raw)

    # Get list of reasons
    subjects_match = re.search('"outage_subjects":(.*?)],"', html)
    if subjects_match is None:
        raise ValueError('outage_subjects table not found in page')
    # The regex stops before the closing bracket, so add it back.
    subjects = json.loads(subjects_match.group(1) + ']')
    reason_by_id = {subject['_id']: subject['name'] for subject in subjects}

    reports = []
    reports_match = re.search('"recentReports":(.*?)],"_', html)
    if reports_match is not None:
        # The data is in embedded javascript. Remove constructors to make it JSON
        # We used the closing brackets in the regex and don't capture them, adding them back again
        sanitized = re.sub(r'new Date\(".+?"\)', '""', reports_match.group(1)) + ']'
        reports = json.loads(sanitized)

    records = []
    for report in reports:
        try:
            records.append({
                'id': report['_id'],
                'country': _report_country(report),
                'reason': _report_reason(report, reason_by_id),
            })
        except Exception:
            # Debugging aid: show the report that could not be handled.
            print(report)
            raise

    return pd.DataFrame(records, columns=REPORT_COLUMNS)


if __name__ == "__main__":
    # A notebook kernel runs cells with __name__ == "__main__", so the sample
    # runs below still execute there, while importing these helpers from
    # another module has no side effects.
    from dateutil.parser import parse as parse_date

    # Scratch check: a parsed date can be moved to another year with replace().
    pdate = parse_date("Incident on 2019-12-19 22:49 UTC".split("on")[1])
    print(pdate.replace(year=2018))

    # Scratch check: splitting an archive path into its components.
    print(re.compile('[./]').split("/github-status/2020/11/20201107.zip"))

    # Binary mode lets Beautiful Soup detect the page encoding itself.
    with open('github-global-status.html', 'rb') as f:
        status_data = process_github_page(f, {})
    print(status_data)
| \n", " | id | \n", "country | \n", "reason | \n", "
|---|---|---|---|
| 0 | \n", "5fc3c224dcd8580a7973cd8c | \n", "RU | \n", "Can't login | \n", "
| 1 | \n", "5fc3c11dfb995a344b9b68ef | \n", "TR | \n", "Newsfeed | \n", "
| 2 | \n", "5fc3bef2fb995a344b9b68ed | \n", "KZ | \n", "Mobile app crashes | \n", "
| 3 | \n", "5fc3bb19ae91e3734cd1777b | \n", "US | \n", "Can't upload pics | \n", "
| 4 | \n", "5fc3b738dcd8580a7973cd80 | \n", "GR | \n", "Website down | \n", "
| 5 | \n", "5fc3b66bd4d3565ae19b14b5 | \n", "RU | \n", "Mobile app not working | \n", "