抓取gitlab所有项目地址
抓取gitlab所有项目地址
本文主要介绍通过使用java语言及html爬虫工具包jsoup及excel工具包easyexcel抓取gitlab的项目地址输出到excel中。
首先,前提条件是拥有可以查看所有项目的管理员权限。进入所有项目页面,url为:https://gitlab.xxx.com/admin/projects,如下图
浏览器打开地址后分析html内容,主要获取项目的名称、项目url、项目备注,最后导出到excel中。新建springboot项目,pom配置如下:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build for the demo project: a Spring Boot application that uses jsoup and EasyExcel. -->
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the Spring Boot parent POM; dependencies below without a <version> rely on it. -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.7.18</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.moreair</groupId>
<artifactId>demo</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>demo</name>
<description>Demo project for Spring Boot</description>
<dependencies>
<!-- EasyExcel: Excel read/write library. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>3.1.1</version> <!-- check for the latest version -->
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<!-- Test-scoped only; not part of the runtime classpath. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- jsoup: HTML fetching and parsing library. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version> <!-- check for the latest version -->
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
首先需要登录gitlab,在浏览器按F12打开开发者工具查看网络请求,随便找一个请求获取cookie值,后面jsoup带上这个cookie就可以跳过登录,如下图
接下来通过jsoup抓取html,connect方法用于连接要访问的地址,headers设置我们上面获取到的cookie值,免登录。然后通过jsoup的各种方法获取html内容,document存的是html内容,通过select、attr等方法可以获取html里面的不同元素,jsoup的具体使用可以查看网上的其他文章。最后使用easyexcel将抓取的数据输出到excel。注意写文章的时候我发现备注可以在/admin/projects页面获取,如文章开篇图分析,因为代码是先写的所以会有所不同,但结果一致,思路也一样。代码如下:
package com.moreair;
import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.annotation.ExcelProperty;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import javax.annotation.PostConstruct;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.*;
@Slf4j
@Component
public class DownloadService {
@Value("${outputFilePath}")
private String outputFilePath;
@Value("${cookie}")
private String cookie;
@Value("${baseUrl}")
private String baseUrl;
@PostConstruct
public void core() throws Exception {
List<ColumnContent> data = new ArrayList<>();
for (int i = 0; i < 16; i++) {
getUrl(i,data);
}
outputFilePath=outputFilePath+"\\"+System.currentTimeMillis()+".xlsx";
EasyExcel.write(outputFilePath, ColumnContent.class).sheet("代码信息").doWrite(data);
log.info("Excel 文件已导出到: " + outputFilePath);
}
private void getUrl(int page,List<ColumnContent> data) throws IOException {
Connection connect = Jsoup.connect(baseUrl+"/admin/projects?page=" + page + "&sort=latest_activity_desc");
Map headerMap = new HashMap();
headerMap.put("cookie", cookie);
connect.headers(headerMap);
Document document = connect.get();
Elements projectRows = document.select(".title");
log.info("当前数量={}",projectRows.size());
for (Element projectRow : projectRows) {
try {
Element aTag = projectRow.selectFirst("a");
String href = aTag.attr("href").replace("/admin/projects", "");
String projectName = getProjectName(aTag, href);
String codeUrl=baseUrl + href;
String content = getDescription(headerMap, codeUrl);
if(href!=null&&href.length()>0){
data.add(new ColumnContent(projectName,codeUrl, content));
}
} catch (Exception e) {
log.error("获取异常",e);
}
}
}
private String getDescription(Map headerMap, String codeUrl) throws IOException {
Connection connectDesc = Jsoup.connect(codeUrl);
connectDesc.headers(headerMap);
Document document1 = connectDesc.get();
Element metaDescription = document1.selectFirst("meta[name=description]");
String content = null;
if (metaDescription != null) {
content = metaDescription.attr("content");
log.info("Description: " + content);
} else {
log.info("Meta description not found.");
}
return content;
}
private String getProjectName(Element a, String href) {
Element projectNameSpan = a.selectFirst("span.project-name");
String projectName = null;
if (projectNameSpan != null) {
// 获取project-name的文本
projectName = projectNameSpan.text();
log.info("Project Name: " + projectName + " href:" + href);
} else {
log.info("Project name not found.");
}
return projectName;
}
@Data
public static class ColumnContent {
@ExcelProperty("名称")
private String projectName;
@ExcelProperty("代码路径链接")
private String url;
@ExcelProperty("备注")
private String remark;
public ColumnContent(String projectName,String url, String remark) {
this.url = url;
this.projectName = projectName;
this.remark = remark;
}
}
}
本项目难点在于分析html的结构和内容,jsoup的使用只要查阅相关使用文档即可。
本项目github地址:https://github.com/MoreAir/get-gitlab-project-info
喜欢的可以关注本人公众号:moreair,定期分享文章!
更多推荐
所有评论(0)