【Python 爬虫】设计自己的爬虫 4. 封装模拟浏览器 PlaywrightSimulate

本文给出基于 Playwright 的模拟浏览器封装类 PlaywrightSimulate 的完整代码，以及一个使用示例。

loyd3

484人浏览 · 2024-01-05 10:20:28

loyd3 · 2024-01-05 10:20:28 发布

class PlaywrightSimulate(BrowserSimulateBase):
    """Playwright-backed implementation of the ``BrowserSimulateBase`` interface.

    The instance tracks one browser, one browser context and one "current"
    page. Most methods are best-effort: any exception is caught, reported
    with ``print`` and replaced by a neutral return value (``None``, ``[]``
    or ``""``). ``wait_until_element`` is the exception: it re-raises
    failures as ``TimeoutError``.
    """

    def __init__(self):
        # All three handles stay None until start_browser() / start_page() run.
        self.browser = None
        self.context = None
        self.page = None

    def start_browser(self, is_headless: bool = False, playwright_options: dict | None = None,
                      *args, **kwargs) -> Any:
        """Launch Chromium and open a fresh browser context.

        Args:
            is_headless: Run without a visible window when True. Defaults to False.
            playwright_options: Extra keyword arguments forwarded to
                ``chromium.launch``. Defaults to no extra options.
            *args: Accepted but not used by this implementation.
            **kwargs: ``playwright`` may carry the Playwright driver object
                (the value yielded by ``sync_playwright()``). Other keys
                are ignored.

        Returns:
            The launched browser, or None if launching failed.
        """
        launch_options = dict(playwright_options or {})

        try:
            driver = kwargs.get("playwright")
            if driver is None:
                # Backward compatibility: earlier revisions launched through a
                # module-level name ``p`` instead of an explicit argument.
                driver = globals().get("p")
            if driver is None:
                raise RuntimeError(
                    "no Playwright driver available; pass playwright=<object from sync_playwright()>"
                )

            self.browser = driver.chromium.launch(headless=is_headless, **launch_options)
            self.context = self.browser.new_context()
            return self.browser
        except Exception as e:
            print(f"启动浏览器时发生错误：{e}")
            return None

    def start_page(self, url: str):
        """Open a new page in the current context and navigate it to *url*.

        The new page becomes the current page (``self.page``).

        Args:
            url: Address to navigate to.

        Returns:
            The new page, or None if creating or navigating it failed.
        """
        try:
            self.page = self.context.new_page()
            self.page.goto(url)
            return self.page
        except Exception as e:
            print(f"启动页面时发生错误：{e}")
            return None

    def wait_until_element(self, selector: str, timeout: int | None = None, selector_type=None):
        """Block until an element matching *selector* appears on the current page.

        Args:
            selector: Selector of the element to wait for.
            timeout: Maximum wait in seconds; it is converted to the
                milliseconds Playwright expects. None uses Playwright's
                default timeout.
            selector_type: Unused here; kept so the signature matches the
                base class.

        Raises:
            TimeoutError: If waiting fails for any reason. The original
                exception is chained as the cause.
        """
        # Guard the conversion: ``None * 1000`` would raise TypeError.
        timeout_ms = None if timeout is None else timeout * 1000
        try:
            self.page.wait_for_selector(selector, timeout=timeout_ms)
        except Exception as e:
            raise TimeoutError(f"等待元素 {selector} 出现时发生超时错误：{e}") from e

    def wait_for_timeout(self, timeout: int):
        """Pause for a fixed amount of time.

        Args:
            timeout: Time to wait, in milliseconds (passed to Playwright unchanged).
        """
        try:
            self.page.wait_for_timeout(timeout)
        except Exception as e:
            print(f"等待时间时发生错误：{e}")

    def find_elements(self, selector: str, selector_type=None) -> list[ElementHandle]:
        """Return every element on the current page that matches *selector*.

        Args:
            selector: Selector to match.
            selector_type: Unused here; kept so the signature matches the
                base class.

        Returns:
            The matching element handles; an empty list on error.
        """
        try:
            return self.page.query_selector_all(selector)
        except Exception as e:
            print(f"查找元素时发生错误：{e}")
            return []

    def find_element(self, selector: str, selector_type=None) -> ElementHandle | None:
        """Return the first element on the current page that matches *selector*.

        Args:
            selector: Selector to match.
            selector_type: Unused here; kept so the signature matches the
                base class.

        Returns:
            The first matching element handle, or None if nothing matches
            or an error occurs.
        """
        try:
            return self.page.query_selector(selector)
        except Exception as e:
            print(f"查找元素时发生错误：{e}")
            return None

    def find_iframe_elements(self, selector: str, iframe: Frame) -> list[ElementHandle]:
        """Return every element inside *iframe* that matches *selector*.

        Args:
            selector: Selector to match.
            iframe: Frame to search in.

        Returns:
            The matching element handles; an empty list on error.
        """
        try:
            return iframe.query_selector_all(selector)
        except Exception as e:
            print(f"在 iframe 中查找元素时发生错误：{e}")
            return []

    def find_iframe_element(self, selector: str, iframe: Frame) -> ElementHandle | None:
        """Return the first element inside *iframe* that matches *selector*.

        Args:
            selector: Selector to match.
            iframe: Frame to search in.

        Returns:
            The first matching element handle, or None if nothing matches
            or an error occurs.
        """
        try:
            return iframe.query_selector(selector)
        except Exception as e:
            print(f"在 iframe 中查找元素时发生错误：{e}")
            return None

    def send_keys(self, selector: str, input_content: str, selector_type=None):
        """Fill the element matching *selector* (typically an input box) with text.

        Args:
            selector: Selector of the target element.
            input_content: Text to enter.
            selector_type: Unused here; kept so the signature matches the
                base class.
        """
        try:
            self.page.fill(selector, input_content)
        except Exception as e:
            print(f"输入文本时发生错误：{e}")

    def execute_script(self, script_command: str):
        """Evaluate a JavaScript snippet in the current page.

        Args:
            script_command: JavaScript source to evaluate.

        Returns:
            Whatever ``page.evaluate`` returns for the script, or None if
            evaluation failed.
        """
        try:
            return self.page.evaluate(script_command)
        except Exception as e:
            print(f"执行 JavaScript 脚本时发生错误：{e}")
            return None

    def go_back(self):
        """Navigate the current page one step back in its history."""
        try:
            self.page.go_back()
        except Exception as e:
            print(f"返回上一页时发生错误：{e}")

    def go_forward(self):
        """Navigate the current page one step forward in its history."""
        try:
            self.page.go_forward()
        except Exception as e:
            print(f"前进到下一页时发生错误：{e}")

    def get_cookies(self):
        """Return the cookies held by the browser context.

        Returns:
            The list returned by ``context.cookies()``; an empty list on error.
        """
        try:
            return self.context.cookies()
        except Exception as e:
            print(f"获取 Cookies 时发生错误：{e}")
            return []

    def add_cookie(self, cookie: dict):
        """Add one cookie (or a sequence of cookies) to the browser context.

        Args:
            cookie: A single cookie dict. A sequence of cookie dicts is also
                accepted and forwarded as a list.
        """
        try:
            # Playwright's add_cookies() takes a list, so wrap a lone dict.
            cookies = [cookie] if isinstance(cookie, dict) else list(cookie)
            self.context.add_cookies(cookies)
        except Exception as e:
            print(f"添加 Cookies 时发生错误：{e}")

    def del_cookies(self):
        """Remove every cookie from the browser context."""
        try:
            self.context.clear_cookies()
        except Exception as e:
            print(f"删除 Cookies 时发生错误：{e}")

    def switch_tab(self, tab_index: int):
        """Bring the tab at *tab_index* to the front and make it the current page.

        Args:
            tab_index: Index into ``context.pages``.
        """
        try:
            target_page = self.context.pages[tab_index]
            target_page.bring_to_front()
            # Retarget self.page so later calls act on the tab just selected.
            self.page = target_page
        except Exception as e:
            print(f"切换选项卡时发生错误：{e}")

    def reload_page(self):
        """Reload the current page."""
        try:
            self.page.reload()
        except Exception as e:
            print(f"刷新页面时发生错误：{e}")

    def screen_page(self, file_path: str | None = None):
        """Take a screenshot of the current page.

        Args:
            file_path: Path forwarded to ``page.screenshot`` as ``path``.
                Defaults to None.

        Returns:
            None
        """
        try:
            self.page.screenshot(path=file_path)
        except Exception as e:
            print(f"截图时发生错误：{e}")

    def close_browser(self):
        """Close the browser."""
        try:
            self.browser.close()
        except Exception as e:
            print(f"关闭浏览器时发生错误：{e}")

    def get_content(self) -> str:
        """Return the content of the current page.

        Returns:
            The string returned by ``page.content()``; an empty string on error.
        """
        try:
            return self.page.content()
        except Exception as e:
            print(f"获取页面内容时发生错误：{e}")
            return ""

    def click(self, selector: str):
        """Click the element matching *selector* on the current page.

        Args:
            selector: Selector of the element to click.
        """
        try:
            self.page.click(selector)
        except Exception as e:
            print(f"点击元素时发生错误：{e}")

    @staticmethod
    def _box_center(box: dict) -> tuple:
        """Return the ``(x, y)`` centre of a bounding-box dict."""
        return box['x'] + box['width'] / 2, box['y'] + box['height'] / 2

    def drag_and_drop(self, source_element: ElementHandle, target_element: ElementHandle):
        """Drag *source_element* onto *target_element* with the mouse.

        The mouse is pressed at the centre of the source element, moved to
        the centre of the target element and released there.

        Args:
            source_element: Element to drag.
            target_element: Element to drop onto.
        """
        try:
            source_box = source_element.bounding_box()
            target_box = target_element.bounding_box()
            # bounding_box() yields None for elements that are not visible.
            if source_box is None or target_box is None:
                raise ValueError("element has no bounding box (it may be hidden or detached)")

            source_x, source_y = self._box_center(source_box)
            target_x, target_y = self._box_center(target_box)

            mouse = self.page.mouse
            mouse.move(source_x, source_y)
            mouse.down()
            try:
                mouse.move(target_x, target_y)
            finally:
                # Always release, so a failed move cannot leave the button held.
                mouse.up()
        except Exception as e:
            print(f"拖拽操作时发生错误：{e}")

    def to_iframe(self, iframe_name: str) -> Frame | None:
        """Look up the frame of the current page whose name is *iframe_name*.

        Args:
            iframe_name: Name of the wanted iframe.

        Returns:
            The first frame with that name, or None if there is none or an
            error occurs.
        """
        try:
            target_frame = next(
                (frame for frame in self.page.frames if frame.name == iframe_name),
                None,
            )
            if target_frame is None:
                print(f"未找到名称为 {iframe_name} 的 iframe。")
            return target_frame
        except Exception as e:
            print(f"切换到 iframe 时发生错误：{e}")
            return None

    def wait_page_loaded(self):
        """Wait until the current page reaches the ``'load'`` state."""
        try:
            self.page.wait_for_load_state('load')
        except Exception as e:
            print(f"等待页面加载完成时发生错误：{e}")

使用示例（运行前需先安装 Playwright，并执行 `from playwright.sync_api import sync_playwright`）：

if __name__ == '__main__':
    # Demo: search Baidu for "Python" and save a screenshot of the result page.
    #
    # Keep the variable name ``p``: start_browser() may resolve the Playwright
    # driver through this module-level name, so renaming it can break the launch.
    with sync_playwright() as p:
        playwright_simulate = PlaywrightSimulate()
        # start_browser() reports its own error and returns None on failure;
        # without a browser none of the steps below can work.
        if playwright_simulate.start_browser(playwright=p) is None:
            raise SystemExit(1)

        try:
            playwright_simulate.start_page('https://www.baidu.com')
            print(playwright_simulate.get_content())
            playwright_simulate.send_keys('#kw', 'Python')
            # wait_for_timeout() takes milliseconds: pause one second.
            playwright_simulate.wait_for_timeout(1000)
            playwright_simulate.click('#su')
            playwright_simulate.wait_until_element('#container', 10)
            playwright_simulate.screen_page('../../files/playwright_test.jpg')
        finally:
            # Close the browser even when one of the steps above raises.
            playwright_simulate.close_browser()

        # Alternative demo: drag and drop inside an iframe.
        # playwright_simulate.start_page('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
        # target_frame = playwright_simulate.to_iframe('iframeResult')
        # source = playwright_simulate.find_iframe_element('#draggable', target_frame)
        # target = playwright_simulate.find_iframe_element('#droppable', target_frame)
        #
        # playwright_simulate.drag_and_drop(source, target)
        # playwright_simulate.close_browser()

        # Alternative demo: open two tabs and switch back to the first one.
        # playwright_simulate.start_page('https://www.baidu.com')
        # playwright_simulate.start_page('https://www.baidu.com')
        #
        # playwright_simulate.switch_tab(0)
        # playwright_simulate.wait_for_timeout(1000)

腾讯云开发者社区

腾讯云面向开发者汇聚海量精品云计算使用和开发经验，营造开放的云计算技术生态圈。

更多推荐

计算机网络微课堂笔记

腾讯云开发者社区

Rabbitmq在java中的使用

腾讯云开发者社区

java try catch 之后定位不到具体报错行_JAVA入门（三）上

点击蓝字｜关注我们一、异常与异常处理异常简介代码中：阻止当前方法或作用域继续实现的，称之为异常java中的所有异常类都继承Throwable类，Exception 的父类是 Throwable编码环境用户操作输入出现问题由java虚拟机自动抛出和自动捕获需要手动添加抛出和捕获语句文件找不到ThrowableErrorException虚拟机错误 VirtualMachineError...