Source code for webtraversallibrary.scraper

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at

#   http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Module for scraping a website and saving contents to file.
"""

import logging
import os.path
from datetime import datetime
from pathlib import Path
from time import sleep
from typing import Callable, Dict, List, Optional

import bs4
from selenium.common.exceptions import TimeoutException, UnexpectedAlertPresentException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.wait import WebDriverWait
from urllib3.util import parse_url

from .config import Config
from .javascript import JavascriptWrapper
from .processtools import TimeoutContext
from .screenshot import Screenshot
from .snapshot import PageSnapshot
from .version import __version__

logger = logging.getLogger("wtl")


class Scraper:
    """
    Used to create web page snapshots using a WebDriver instance.
    """

    def __init__(
        self,
        driver: WebDriver,
        config: Config,
        hide_sticky: bool = True,
        postload_callbacks: Optional[List[Callable]] = None,
    ):
        self.driver = driver
        self.config = config
        self.hide_sticky = hide_sticky
        self.js = JavascriptWrapper(self.driver, config)
        self.postload_callbacks = postload_callbacks
        self.device_pixel_ratio = self.js.execute_script("return window.devicePixelRatio;") or 1.0
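
    # Construction sketch (illustrative, not part of the library). It assumes a
    # chromedriver binary on PATH via Selenium's Chrome driver; `my_config`
    # stands in for however a Config is built in your project.
    #
    #     from selenium import webdriver
    #
    #     driver = webdriver.Chrome()
    #     scraper = Scraper(driver=driver, config=my_config, hide_sticky=True)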

    def scrape_current_page(self) -> Optional[PageSnapshot]:
        """
        Scrape the page currently open in the driver.
        Returns None if config.scraping.attempts is 0.
        """
        attempts = self.config.scraping.attempts
        if attempts < 1:
            logger.warning("config.scraping.attempts is set to 0, no snapshot will be made!")

        snapshot = None
        while attempts > 0:
            try:
                attempts -= 1
                snapshot = self._create_snapshot()
                break
            except WebDriverException as e:
                logger.error(e)
                if attempts <= 0:
                    raise
                logger.warning("Scraping failed, reloading the page and trying again...")
                while attempts > 0:
                    sleep(1.0)
                    try:
                        self.refresh()
                        break
                    except WebDriverException as e:
                        attempts -= 1
                        logger.error(e)
                        if attempts <= 0:
                            raise

        return snapshot

    def refresh(self):
        """
        Triggers a refresh of the page. Blocks until page is loaded.
        """
        self.driver.refresh()
        if self.config.scraping.disable_animations:
            self.js.disable_animations()
        self.wait_until_loaded()

    def navigate(self, url: str):
        """
        Navigate the scraper's internal webdriver to the URL. Blocks until page is loaded.
        """
        try:
            if parse_url(url).scheme is None:
                url = "http://" + url
            self.driver.get(url)
        except TimeoutException:
            raise TimeoutError(f"WebDriver page load timed out on url '{url}'")
        except WebDriverException:
            logger.error(f"URL not valid: {url}")

        if self.config.scraping.disable_animations:
            self.js.disable_animations()
        self.wait_until_loaded()

    def wait_until_loaded(self, timeout: Optional[int] = None):
        """
        Waits on the webdriver instance to finish loading a page before returning.

        .. note::
            Because of the unending imagination of javascript devs, there may be
            cases where this function returns before the timeout although the
            page hasn't loaded.
        """
        if timeout is None:
            timeout = self.config.scraping.page_load_timeout

        try:
            # Wait until page is loaded from JS and jQuery perspective
            WebDriverWait(self.driver, timeout).until(self.js.is_page_loaded)
        except TimeoutException:
            logger.warning("Timed out waiting for the page to fully load, proceeding anyway")
        except UnexpectedAlertPresentException:
            logger.warning("Javascript alert noted, trying to proceed")

        sleep(self.config.scraping.wait_loading)

        # Some pages change after scrolling, so simulate some movement
        if self.config.scraping.prescroll:
            self.js.scroll_to(0, 100)
            sleep(self.config.scraping.wait_scroll)
            self.js.scroll_to(0, 9999)
            sleep(self.config.scraping.wait_scroll)
            self.js.scroll_to(0, 0)
            sleep(self.config.scraping.wait_scroll)

    def capture_screenshot(self, name: str, max_page_height: int = 0) -> Screenshot:
        """
        Captures a screenshot of the current rendering in the browser window.
        """
        return Screenshot.capture(
            name=name, driver=self.driver, scale=1 / self.device_pixel_ratio, max_page_height=max_page_height
        )

    def get_page_as_mhtml(self) -> Optional[bytes]:
        """
        Gets an MHTML representation of the current page and returns it as a bytestring.
        MHTML must be enabled in the browser configuration; otherwise, returns None.
        """
        if not self.config.browser.enable_mhtml:
            return None

        folder = Path(os.path.expanduser(self.config.scraping.temp_path))
        os.makedirs(folder, exist_ok=True)
        filename = "temp_mhtml_file.txt"

        try:
            os.remove(folder / filename)
        except OSError:
            pass

        self.js.save_mhtml(filename)

        try:
            with TimeoutContext(n_seconds=self.config.scraping.mhtml_timeout):
                while not os.path.exists(folder / filename):
                    sleep(0.25)
        except TimeoutError:
            logger.error("Failed to get MHTML snapshot within desired time limit!")
            return None

        with open(folder / filename, "rb") as f:
            return f.read()
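
    # Usage sketch for get_page_as_mhtml (illustrative): persisting the returned
    # bytes to disk, assuming config.browser.enable_mhtml was set to True when
    # the browser was configured.
    #
    #     data = scraper.get_page_as_mhtml()
    #     if data is not None:
    #         Path("page.mhtml").write_bytes(data)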

    def _create_snapshot(self) -> PageSnapshot:
        before = datetime.now()
        logger.debug(f"Page title: {self.driver.title}")

        # Create screenshots if required
        screenshots: Dict[str, Screenshot] = {}
        if self.config.debug.screenshots:
            max_page_height = self.config.scrolling.max_page_height
            screenshots["first"] = self.capture_screenshot("first")
            if max_page_height == 0:
                screenshots["full"] = screenshots["first"].copy("full")
            else:
                screenshots["full"] = self.capture_screenshot("full", max_page_height=max_page_height)

        # Gather element metadata
        elements_metadata = self.js.get_element_metadata() or []
        num_elements = len(elements_metadata)

        # Gather page metadata
        inner_html = self.driver.find_element(By.XPATH, "/html").get_attribute("innerHTML")
        page_source = bs4.BeautifulSoup(f"<!DOCTYPE html><html>{inner_html}</html>", self.config.bs_html_parser)
        page_metadata = {
            "timestamp": before.isoformat(),
            "url": self.driver.current_url,
            "title": self.driver.title,
            "driver": self.driver.name,
            "full_page_size": (self.config.browser.width, self.js.get_full_height()),
            "device_pixel_ratio": self.device_pixel_ratio,
            "num_elements": num_elements,
            "screenshots": list(screenshots.keys()),
            "wtl_version": __version__,
        }

        milliseconds_passed = (datetime.now() - before).total_seconds() * 1000
        if num_elements < 2:
            logger.info("No elements detected on the page!")
        else:
            logger.info(f"Found {num_elements} elements in {milliseconds_passed:.2f}ms")

        mhtml_source = self.get_page_as_mhtml() if self.config.scraping.save_mhtml else None

        # Assemble snapshot
        snapshot = PageSnapshot(
            page_source=page_source,
            page_metadata=page_metadata,
            elements_metadata=elements_metadata,
            screenshots=screenshots,
            mhtml_source=mhtml_source,
        )

        return snapshot
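

# Minimal end-to-end sketch (illustrative, not part of the library). It assumes
# a chromedriver binary on PATH; `build_config()` is a hypothetical placeholder
# for however a Config is constructed in your project.
if __name__ == "__main__":
    from selenium import webdriver

    def build_config() -> Config:
        # Hypothetical placeholder: substitute your own Config construction.
        raise NotImplementedError

    driver = webdriver.Chrome()
    scraper = Scraper(driver, build_config())
    scraper.navigate("https://example.com")
    snapshot = scraper.scrape_current_page()
    print("Snapshot created:", snapshot is not None)
    driver.quit()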