本文通过Java+Playwright代码示例演示网页信息爬取基础方法,以小红书为例讲解页面元素定位技巧,仅供技术学习参考,请严格遵循网站Robots协议及法律法规,禁止将数据用于商业传播、非法牟利等侵权行为。

主代码:小红书爬虫示例,演示多种元素定位方式、获取笔记中的全部图片、点击头像跳转作者主页等功能

package org.example.demo.crawler.xiaohongshu;

import com.microsoft.playwright.*;
import com.microsoft.playwright.options.LoadState;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;

import java.util.List;
import java.util.Objects;

/**
 * Demo crawler that opens a Xiaohongshu note with Playwright and prints a few
 * pieces of information found on the page: title, author name, image links and
 * the author's profile page that opens after clicking the avatar.
 *
 * <p>The class is a runnable example; all results are written to standard output.
 */
public class XiaohongshuCrawler {

    /**
     * Resolves the hard-coded short link to its redirect target and crawls it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // A short link has to be resolved to its redirect target first.
        String url = "http://xhslink.com/a/jhhpSVp0iGD8";
        String location = getLocation(url);

        xiaohongshuCrawler(location);
    }

    /**
     * Opens {@code url} in a headless Chromium instance and prints the data
     * extracted from the note page.
     *
     * @param url address of the note page to open
     */
    private static void xiaohongshuCrawler(String url) {
        Browser.NewContextOptions contextOptions = new Browser.NewContextOptions();
        String userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0";
        contextOptions.setUserAgent(userAgent);
        contextOptions.setViewportSize(500, 800); // viewport width and height

        // headless=false shows the browser window for debugging; headless=true saves resources.
        BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions().setHeadless(true);

        try (Playwright playwright = Playwright.create();
             Browser browser = playwright.chromium().launch(launchOptions);
             BrowserContext context = browser.newContext(contextOptions)
        ) {
            System.out.println("小红书爬虫开始 " + url);
            Page page = context.newPage();
            page.navigate(url, new Page.NavigateOptions().setTimeout(10000));
            page.waitForLoadState(LoadState.NETWORKIDLE);

            System.out.println("标题 " + page.title());
            System.out.println("当前链接 " + page.url());

            // Locate a single element by class; first() avoids a strict-mode error on multiple matches.
            Locator username = page.locator(".username").first();
            System.out.println("用户名 " + username.textContent());

            printImageLinks(page);

            // Narrow down to one region by class first, then search inside it, so that
            // later selectors do not match unrelated elements elsewhere on the page.
            Locator author = page.locator(".note-container").locator(".author");
            System.out.println("author size " + author.count());

            // Locate an element by its exact text content (XPath).
            Locator follow = author.locator("//span[text()='关注']");
            System.out.println("关注按钮数量 " + follow.count());

            closeLoginDialogIfPresent(page);
            openAuthorPage(context, author);

            // Close the note page last.
            page.close();
        }

    }

    /** Prints the number of note images and the {@code src} attribute of each one. */
    private static void printImageLinks(Page page) {
        // Locate a list of elements by class.
        List<Locator> images = page.locator(".note-slider-img").all();
        System.out.println("获取到图片数量 " + images.size());
        for (Locator image : images) {
            System.out.println(image.getAttribute("src"));
        }
    }

    /** Clicks the close button of the login dialog when such a dialog is on the page. */
    private static void closeLoginDialogIfPresent(Page page) {
        // [class='x'] matches only elements whose class attribute is exactly "x".
        Locator loginContainers = page.locator("[class='login-container']");
        int loginContainerCount = loginContainers.count();
        System.out.println("cssContainerElements size " + loginContainerCount);
        if (loginContainerCount > 0) {
            System.out.println("关闭登录框");
            loginContainers.first().locator(".button.close").click();
        }
    }

    /**
     * Clicks the author's avatar, waits for the page that the click opens, prints
     * its title and address and closes it again. Does nothing but log when no
     * avatar is found.
     */
    private static void openAuthorPage(BrowserContext context, Locator author) {
        Locator avatarItem = author.locator(".avatar-item");
        int avatarCount = avatarItem.count();
        System.out.println("用户头像数量 " + avatarCount);
        if (avatarCount == 0) {
            // Clicking a locator without matches would only block until the timeout and then throw.
            System.out.println("未找到作者头像,跳过作者主页跳转");
            return;
        }

        // Click exactly one element: a locator that matches several nodes makes click() throw.
        Locator firstAvatar = avatarItem.first();
        // The click opens a new page, so register the wait before clicking.
        Page newPage = context.waitForPage(firstAvatar::click);
        newPage.waitForLoadState(LoadState.NETWORKIDLE);
        System.out.println("新页面标题 " + newPage.title());
        System.out.println("新页面地址 " + newPage.url());
        newPage.close();
    }

    /**
     * Sends a GET request to {@code url} and returns the redirect target announced
     * in the {@code Location} response header.
     *
     * <p>A relative {@code Location} value is resolved against {@code url}. When the
     * response carries no {@code Location} header (for example because the HTTP
     * client already followed the redirect, or the link is not a short link),
     * {@code url} itself is returned so that the browser can resolve it.
     *
     * @param url the (short) link to resolve
     * @return the absolute redirect target, or {@code url} when no redirect is reported
     */
    public static String getLocation(String url) {
        RestTemplate restTemplate = new RestTemplate();
        ResponseEntity<String> response = restTemplate.exchange(url, HttpMethod.GET, null, String.class);
        HttpHeaders responseHeaders = response.getHeaders();

        java.net.URI location = responseHeaders.getLocation();
        if (location == null) {
            return url;
        }
        if (location.isAbsolute()) {
            return location.toString();
        }
        return java.net.URI.create(url).resolve(location).toString();
    }

}

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐