{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import bs4\n", "from dateutil.parser import parse as parse_date\n", "import numpy as np\n", "from datetime import datetime\n", "import re\n", "import json" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "datetime.datetime(2018, 12, 19, 22, 49, tzinfo=tzutc())" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdate = parse_date(\"Incident on 2019-12-19 22:49 UTC\".split(\"on\")[1])\n", "pdate.replace(year=2018)" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'event_start_time': 1576795740,\n", " 'event_end_time': 1576797120,\n", " 'status': 'minor'}]" ] }, "execution_count": 156, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Parser for github status till september 2, 2020\n", "# https://github.blog/2020-09-02-github-availability-report-august-2020/\n", "\n", "def get_incident_year(incident_title):\n", " try:\n", " current_node = incident_title.parent.previous_sibling\n", " while type(current_node) != bs4.element.Tag or current_node.find(attrs={'data-var':'year'}) == None:\n", " current_node = current_node.previous_sibling\n", " return int(current_node.find(attrs={'data-var':'year'}).get_text())\n", " except:\n", " print(incident_title.parent.previous_sibling.previous_sibling)\n", " raise\n", "\n", "def get_incident_type(incident_title):\n", " classes = incident_title.attrs['class']\n", " if 'impact-minor' in classes:\n", " return 'minor'\n", " elif 'impact-none' in classes:\n", " return 'minor'\n", " elif 'impact-major' in classes:\n", " return 'major'\n", " elif 'impact-critical' in classes:\n", " return 'critical'\n", " elif 'impact-maintenance' in classes:\n", " return 'maintenance'\n", " else:\n", " raise Exception('Unknown impact class: ' + str(incident_title))\n", " \n", "def get_maintenance_incident_details(incident_update):\n", " raise Exception('Unimplemented. Implement when we encounter one')\n", " \n", "def get_normal_incident_details(incident_updates, year):\n", " try:\n", " problem = False\n", " investigating_divs = incident_updates.select('.investigating')\n", " if len(investigating_divs) == 1:\n", " incident_start_time_str = investigating_divs[0].select('small')[0].get_text().strip()\n", " incident_start_time = parse_date(incident_start_time_str).replace(year=year).timestamp()\n", " else:\n", " incident_start_time = -1\n", " problem = True\n", "\n", " incident_end_time_str = incident_updates.select('.resolved')[0].select('small')[0].get_text().strip()\n", " incident_end_time = parse_date(incident_end_time_str).replace(year=year).timestamp()\n", "\n", " ret_data = {\n", " 'event_start_time': np.int64(incident_start_time),\n", " 'event_end_time': np.int64(incident_end_time)\n", " }\n", " \n", " if problem:\n", " ret_data['status'] = incident_updates.get_text()\n", " \n", " return ret_data\n", " \n", " except:\n", " print(incident_updates)\n", " raise\n", "\n", "def process_github_page(file_obj, metadata):\n", " status_data = BeautifulSoup(file_obj.read(), 'lxml')\n", " \n", " incident_types = ['impact-minor', 'impact-none', 'impact-major', 'impact-critical', 'impact-maintenance']\n", "\n", " incidents_titles = status_data.findAll(attrs={'class': lambda x: x and 'incident-title' in x and any(incident_type in x for incident_type in incident_types)})\n", " incidents_updates = status_data.select('.updates-container')\n", " if len(incidents_titles) != len(incidents_updates):\n", " raise Exception('Number of titles does not match number of updates containers: ' + metadata['date'])\n", " \n", " extracted_incident_info = []\n", " for i in range(len(incidents_titles)):\n", " incident_type = get_incident_type(incidents_titles[i])\n", " incident_year = get_incident_year(incidents_titles[i])\n", "\n", " incident_details = None\n", " if incident_type == 'maintenance':\n", " incident_details = get_maintenance_incident_details(incidents_updates[i])\n", " else:\n", " incident_details = get_normal_incident_details(incidents_updates[i], incident_year)\n", "\n", " # Using incident field for error if required\n", " if 'status' not in incident_details:\n", " incident_details['status'] = incident_type\n", "\n", " extracted_incident_info.append(incident_details)\n", "\n", " return extracted_incident_info\n", "\n", "status_data = None\n", "with open('github-global-status.html') as f:\n", " status_data = process_github_page(f, {})\n", "status_data" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'event_start_time': 1576795740,\n", " 'event_end_time': 1576797120,\n", " 'status': 'minor'}]" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "status_data = None\n", "with open('github-global-status.html') as f:\n", " status_data = process_github_page(f, {})\n", "status_data" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['', 'github-status', '2020', '11', '20201107', 'zip']" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "re.compile('[./]').split(\"/github-status/2020/11/20201107.zip\")" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "def extract_individual_reports(file_obj):\n", " html = str(file_obj.read())\n", " if html is None:\n", " raise ValueError(f'ERROR: HTML IS NONE \\n filename {filename} \\t file_obj {file_obj} ')\n", " \n", " # Get list of reasons\n", " matches = re.search('\"outage_subjects\":(.*?)],\"', html)\n", " list_of_reasons = json.loads(matches.group(1) + ']')\n", " map_of_reasons = {}\n", " for reason in list_of_reasons:\n", " map_of_reasons[reason['_id']] = reason['name']\n", "\n", " matches = re.search('\"recentReports\":(.*?)],\"_', html)\n", " reports = []\n", " if matches is not None:\n", " # The data is in embedded javascript. Remove constructors to make it JSON\n", " # We used the closing brackets in the regex and don't capture them, adding them back again\n", " sanitized_match = re.sub(r'new Date\\(\".+?\"\\)', '\"\"', matches.group(1)) + ']'\n", " reports = json.loads(sanitized_match)\n", "\n", " \n", " report_id = []\n", " report_country = []\n", " report_reason = []\n", " for datapoint in reports:\n", " try:\n", " report_id.append(datapoint['_id'])\n", " \n", " if 'country_code' in datapoint:\n", " report_country.append(datapoint['country_code'])\n", " elif 'countryCode' in datapoint:\n", " report_country.append(datapoint['countryCode'])\n", " else:\n", " raise Exception('Country code not found')\n", " \n", " if 'oSubjectId' in datapoint:\n", " report_reason.append(map_of_reasons[datapoint['oSubjectId']])\n", " else:\n", " reason_found = False\n", " for name, value in datapoint.items():\n", " if 'subject' in name or 'Subject' in name:\n", " report_reason.append(map_of_reasons[datapoint[name]])\n", " reason_found = True\n", " if not reason_found:\n", " report_reason.append('Unknown')\n", " except:\n", " print(datapoint)\n", " raise\n", "\n", " return pd.DataFrame({\n", " 'id': report_id,\n", " 'country':report_country,\n", " 'reason': report_reason\n", " })" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcountryreason
05fc3c224dcd8580a7973cd8cRUCan't login
15fc3c11dfb995a344b9b68efTRNewsfeed
25fc3bef2fb995a344b9b68edKZMobile app crashes
35fc3bb19ae91e3734cd1777bUSCan't upload pics
45fc3b738dcd8580a7973cd80GRWebsite down
55fc3b66bd4d3565ae19b14b5RUMobile app not working
\n", "
" ], "text/plain": [ " id country reason\n", "0 5fc3c224dcd8580a7973cd8c RU Can't login\n", "1 5fc3c11dfb995a344b9b68ef TR Newsfeed\n", "2 5fc3bef2fb995a344b9b68ed KZ Mobile app crashes\n", "3 5fc3bb19ae91e3734cd1777b US Can't upload pics\n", "4 5fc3b738dcd8580a7973cd80 GR Website down\n", "5 5fc3b66bd4d3565ae19b14b5 RU Mobile app not working" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "status_data = None\n", "with open('outage_report_instagram.html') as f:\n", " status_data = extract_individual_reports(f)\n", "status_data" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'_id': '5a8cb07dee5285d43965f834', 'name': 'Website down'},\n", " {'_id': '5a8cb06fee5285403b65f834', 'name': 'Mobile app not working'},\n", " {'_id': '5a8cb06fee5285403b65f835', 'name': 'Mobile app crashes'},\n", " {'_id': '5a8d3803ee52858b2065f837', 'name': 'Message read problems'},\n", " {'_id': '5a8d3803ee52858b2065f838', 'name': 'Message send problems'},\n", " {'_id': '5a8f0b78ee5285e04e16fca8', 'name': \"Can't login\"},\n", " {'_id': '5a9bdbea1b4314600050c082', 'name': 'Everything is down'},\n", " {'_id': '5a8f18e4ee5285ad7116fca7', 'name': \"Can't upload pics\"},\n", " {'_id': '5a912a4eee5285b07a16fcad', 'name': 'Hashtags not working'},\n", " {'_id': '5a91ab48ee5285e14216fca8', 'name': 'Newsfeed'}]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reports_obj" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'_id': '5fc3c224dcd8580a7973cd8c',\n", " 'serviceId': '5458ac234488de92aa9cda3a',\n", " 'ip': '178.67.193.121',\n", " 'oSubjectId': '5a8f0b78ee5285e04e16fca8',\n", " 'isAmp': True,\n", " 'uaHash': '38c958b1452dc5eda97de8e89acdfcba',\n", " 'langCode': 'ru',\n", " 'latitude': 68.9792,\n", " 'longitude': 33.0925,\n", " 'cityGeonameId': 524305,\n", " 'cityName': 'Murmansk',\n", " 'divisionCode': '49',\n", " 'divisionGeonameId': 524304,\n", " 'countryCode': 'RU',\n", " 'countryGeonameId': 2017370,\n", " 'continentCode': 'EU',\n", " 'continentGeonameId': 6255148,\n", " 'postalCode': '183006',\n", " 'time': '',\n", " '__v': 0,\n", " 'countryGeoname': {'_id': '5c883341232cdab94da82765',\n", " 'geoname_id': 2017370,\n", " 'name': 'Russian Federation',\n", " 'latitude': 60,\n", " 'longitude': 100,\n", " 'feature_code': 'PCLI',\n", " 'country_code': 'RU',\n", " 'admin1_code': '00',\n", " 'center': [60, 100],\n", " 'alt_names': {'de': {'_id': '5c848dee3e5a333eadaa4547',\n", " 'alt_geoname_id': 2419059,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'de',\n", " 'is_preferred_name': True,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Russland'},\n", " 'en': {'_id': '5c848dee3e5a333eadaa450c',\n", " 'alt_geoname_id': 1556487,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'en',\n", " 'is_preferred_name': True,\n", " 'is_short_name': True,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Russia'},\n", " 'es': {'_id': '5c848dee3e5a333eadaa450e',\n", " 'alt_geoname_id': 1556489,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'es',\n", " 'is_preferred_name': True,\n", " 'is_short_name': True,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Rusia'},\n", " 'fr': {'_id': '5c848dee3e5a333eadaa4514',\n", " 'alt_geoname_id': 1556495,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'fr',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Russie'},\n", " 'it': {'_id': '5c848dee3e5a333eadaa454c',\n", " 'alt_geoname_id': 2419064,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'it',\n", " 'is_preferred_name': True,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Russia'},\n", " 'nl': {'_id': '5c848dee3e5a333eadaa454f',\n", " 'alt_geoname_id': 2419067,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'nl',\n", " 'is_preferred_name': True,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Rusland'},\n", " 'pt': {'_id': '5c848dee3e5a333eadaa452e',\n", " 'alt_geoname_id': 1556521,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'pt',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Rússia'},\n", " 'ru': {'_id': '5c848dee3e5a333eadaa4530',\n", " 'alt_geoname_id': 1556523,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'ru',\n", " 'is_preferred_name': True,\n", " 'is_short_name': True,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Россия'},\n", " 'uk': {'_id': '5c848dee3e5a333eadaa4556',\n", " 'alt_geoname_id': 2419074,\n", " 'geoname_id': 2017370,\n", " 'lang_code': 'uk',\n", " 'is_preferred_name': True,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Російська Федерація'}},\n", " 'id': '5c883341232cdab94da82765'},\n", " 'divisionGeoname': {'_id': '5c88333b232cdab94da5fb00',\n", " 'geoname_id': 524304,\n", " 'name': 'Murmanskaya Oblast’',\n", " 'latitude': 68,\n", " 'longitude': 34,\n", " 'feature_code': 'ADM1',\n", " 'country_code': 'RU',\n", " 'admin1_code': '49',\n", " 'center': [68, 34],\n", " 'alt_names': {'de': {'_id': '5c848bae3e5a333eada3646d',\n", " 'alt_geoname_id': 13287290,\n", " 'geoname_id': 524304,\n", " 'lang_code': 'de',\n", " 'is_preferred_name': False,\n", " 'is_short_name': True,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Murmansk'},\n", " 'en': {'_id': '5c848bae3e5a333eada36468',\n", " 'alt_geoname_id': 5463404,\n", " 'geoname_id': 524304,\n", " 'lang_code': 'en',\n", " 'is_preferred_name': False,\n", " 'is_short_name': True,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Murmansk'},\n", " 'fr': {'_id': '5c848bae3e5a333eada3646b',\n", " 'alt_geoname_id': 11761698,\n", " 'geoname_id': 524304,\n", " 'lang_code': 'fr',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Oblast de Mourmansk'},\n", " 'it': {'_id': '5c848bae3e5a333eada36452',\n", " 'alt_geoname_id': 1986257,\n", " 'geoname_id': 524304,\n", " 'lang_code': 'it',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': \"Oblast' di Murmansk\"},\n", " 'nl': {'_id': '5c848bae3e5a333eada36455',\n", " 'alt_geoname_id': 1986260,\n", " 'geoname_id': 524304,\n", " 'lang_code': 'nl',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Oblast Moermansk'},\n", " 'pt': {'_id': '5c848bae3e5a333eada36459',\n", " 'alt_geoname_id': 1986264,\n", " 'geoname_id': 524304,\n", " 'lang_code': 'pt',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Oblast de Murmansk'},\n", " 'ru': {'_id': '5c848bae3e5a333eada3645c',\n", " 'alt_geoname_id': 1986267,\n", " 'geoname_id': 524304,\n", " 'lang_code': 'ru',\n", " 'is_preferred_name': True,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Мурманская область'}},\n", " 'id': '5c88333b232cdab94da5fb00'},\n", " 'cityGeoname': {'_id': '5c88333b232cdab94da5fb01',\n", " 'geoname_id': 524305,\n", " 'name': 'Murmansk',\n", " 'latitude': 68.97917,\n", " 'longitude': 33.09251,\n", " 'feature_code': 'PPLA',\n", " 'country_code': 'RU',\n", " 'admin1_code': '49',\n", " 'center': [68.97917, 33.09251],\n", " 'alt_names': {'de': {'_id': '5c848bae3e5a333eada36470',\n", " 'alt_geoname_id': 1603034,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'de',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Murmansk'},\n", " 'en': {'_id': '5c848bae3e5a333eada364a6',\n", " 'alt_geoname_id': 13358809,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'en',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': True,\n", " '__v': 0,\n", " 'name': 'Romanov'},\n", " 'es': {'_id': '5c848bae3e5a333eada36472',\n", " 'alt_geoname_id': 1603036,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'es',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Múrmansk'},\n", " 'fr': {'_id': '5c848bae3e5a333eada36477',\n", " 'alt_geoname_id': 1603041,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'fr',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Mourmansk'},\n", " 'it': {'_id': '5c848bae3e5a333eada36479',\n", " 'alt_geoname_id': 1603043,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'it',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Murmansk'},\n", " 'nl': {'_id': '5c848bae3e5a333eada3647c',\n", " 'alt_geoname_id': 1603046,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'nl',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Moermansk'},\n", " 'pt': {'_id': '5c848bae3e5a333eada3648b',\n", " 'alt_geoname_id': 1898338,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'pt',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Murmansk'},\n", " 'ru': {'_id': '5c848bae3e5a333eada36491',\n", " 'alt_geoname_id': 6004150,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'ru',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Мурманск'},\n", " 'uk': {'_id': '5c848bae3e5a333eada3649d',\n", " 'alt_geoname_id': 8191448,\n", " 'geoname_id': 524305,\n", " 'lang_code': 'uk',\n", " 'is_preferred_name': False,\n", " 'is_short_name': False,\n", " 'is_colloquial': False,\n", " 'is_historic': False,\n", " '__v': 0,\n", " 'name': 'Мурманськ'}},\n", " 'id': '5c88333b232cdab94da5fb01'},\n", " 'id': '5fc3c224dcd8580a7973cd8c'}" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reports_obj[0]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }