Java 中使用 Playwright 的爬虫示例
本文通过Java+Playwright代码示例演示网页信息爬取基础方法,以小红书为例讲解页面元素定位技巧,仅供技术学习参考,请严格遵循网站Robots协议及法律法规,禁止将数据用于商业传播、非法牟利等侵权行为。
·
本文通过Java+Playwright代码示例演示网页信息爬取基础方法,以小红书为例讲解页面元素定位技巧,仅供技术学习参考,请严格遵循网站Robots协议及法律法规,禁止将数据用于商业传播、非法牟利等侵权行为。
主代码:爬取小红书笔记页面,演示多种元素定位方式、获取文章全部图片、跳转作者主页等功能
package org.example.demo.crawler.xiaohongshu;
import com.microsoft.playwright.*;
import com.microsoft.playwright.options.LoadState;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;
import java.util.List;
import java.util.Objects;
/**
 * Demo crawler that opens a Xiaohongshu note page with Playwright and prints
 * what it finds: page title, user name, image links, the "follow" button count,
 * and the title/URL of the page opened by clicking the author's avatar.
 *
 * <p>The class only demonstrates element-location techniques; every result is
 * written to standard output.
 */
public class XiaohongshuCrawler {

    /** User agent string sent by the browser context. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0";

    /**
     * Entry point: resolves a hard-coded short link and crawls the resulting page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // A short link must first be resolved to the address it redirects to.
        String url = "http://xhslink.com/a/jhhpSVp0iGD8";
        String location = getLocation(url);
        xiaohongshuCrawler(location);
    }

    /**
     * Opens {@code url} in a headless Chromium context and prints the information
     * found on the page, then follows the author's avatar to the page it opens.
     *
     * @param url address of the note page to load
     */
    private static void xiaohongshuCrawler(String url) {
        Browser.NewContextOptions contextOptions = new Browser.NewContextOptions()
                .setUserAgent(USER_AGENT)
                .setViewportSize(500, 800); // window width and height
        // headless=false shows the browser for debugging; headless=true saves resources.
        BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions().setHeadless(true);

        try (Playwright playwright = Playwright.create();
             Browser browser = playwright.chromium().launch(launchOptions);
             BrowserContext context = browser.newContext(contextOptions)) {
            System.out.println("小红书爬虫开始 " + url);
            Page page = context.newPage();
            page.navigate(url, new Page.NavigateOptions().setTimeout(10000));
            page.waitForLoadState(LoadState.NETWORKIDLE);
            System.out.println("标题 " + page.title());
            System.out.println("当前链接 " + page.url());

            // Locate a single element by class.
            Locator username = page.locator(".username").first();
            System.out.println("用户名 " + username.textContent());

            printImageLinks(page);

            // Narrow down to one region by class first, then search inside it, so
            // that later selectors do not match elements from the whole page.
            Locator author = page.locator(".note-container").locator(".author");
            System.out.println("author size " + author.count());

            // Locate an element by its text content.
            Locator follow = author.locator("//span[text()='关注']");
            System.out.println("关注按钮数量 " + follow.count());

            closeLoginDialogIfPresent(page);

            Page newPage = openAuthorPage(context, author);
            System.out.println("新页面标题 " + newPage.title());
            System.out.println("新页面地址 " + newPage.url());

            // Finally close both pages.
            page.close();
            newPage.close();
        }
    }

    /**
     * Prints the number of elements with class {@code note-slider-img} and the
     * {@code src} attribute of each one.
     */
    private static void printImageLinks(Page page) {
        // Locate a list of elements by class.
        List<Locator> images = page.locator(".note-slider-img").all();
        System.out.println("获取到图片数量 " + images.size());
        for (Locator image : images) {
            System.out.println(image.getAttribute("src"));
        }
    }

    /**
     * Clicks the close button of the login dialog when one is present on the page.
     */
    private static void closeLoginDialogIfPresent(Page page) {
        // The attribute selector matches elements whose class attribute is exactly
        // "login-container", not elements that merely include that class.
        Locator loginContainers = page.locator("[class='login-container']");
        int containerCount = loginContainers.count();
        System.out.println("cssContainerElements size " + containerCount);
        if (containerCount > 0) {
            System.out.println("关闭登录框");
            loginContainers.first().locator(".button.close").click();
        }
    }

    /**
     * Clicks the author's avatar and returns the page that the click opens, after
     * that page has reached the network-idle load state.
     *
     * @param context browser context in which the new page is expected to open
     * @param author  locator of the author region that contains the avatar
     * @return the newly opened page
     */
    private static Page openAuthorPage(BrowserContext context, Locator author) {
        Locator avatarItem = author.locator(".avatar-item");
        System.out.println("用户头像数量 " + avatarItem.count());
        // Register the wait before clicking so the new page cannot be missed.
        // Click the first match only: a click on a locator that resolves to
        // several elements is rejected by Playwright's strictness check.
        Page newPage = context.waitForPage(avatarItem.first()::click);
        newPage.waitForLoadState(LoadState.NETWORKIDLE);
        return newPage;
    }

    /**
     * Requests {@code url} with GET and returns the target of its redirect.
     *
     * <p>When the response has no {@code Location} header, {@code url} itself is
     * returned instead of failing with a {@link NullPointerException}.
     *
     * @param url address to resolve, typically a short link
     * @return the {@code Location} header value, or {@code url} when it is absent
     */
    public static String getLocation(String url) {
        RestTemplate restTemplate = new RestTemplate();
        ResponseEntity<String> response = restTemplate.exchange(url, HttpMethod.GET, null, String.class);
        HttpHeaders responseHeaders = response.getHeaders();
        // NOTE(review): whether a redirect is followed transparently depends on the
        // request factory behind RestTemplate; if it is, no Location header is
        // visible here - confirm against the Spring version in use.
        if (responseHeaders.getLocation() == null) {
            return url;
        }
        return responseHeaders.getLocation().toString();
    }
}
更多推荐
所有评论(0)