爬虫任务

This commit is contained in:
朱春云 2025-09-11 12:04:45 +08:00
parent fe5be1107d
commit 7f57dae35c
4 changed files with 72 additions and 36 deletions

View File

@ -13,7 +13,7 @@ import com.ruoyi.busi.vo.BusiKeywordRankStatVO;
/**
* 关键词排名明细Service接口
*
*
* @author vinjor-m
* @date 2025-08-21
*/
@ -26,7 +26,7 @@ public interface IBusiKeywordItemService extends IService<BusiKeywordItem>
* @author vinjor-M
* @date 15:10 2025/8/21
**/
void getKeywordRanking();
void getKeywordRanking() throws InterruptedException;
/**
* 分页查询关键词排名统计

View File

@ -22,9 +22,12 @@ import com.ruoyi.busi.mapper.BusiKeywordItemMapper;
import com.ruoyi.busi.domain.BusiKeywordItem;
import com.ruoyi.busi.service.IBusiKeywordItemService;
import static com.ruoyi.common.utils.GoogleUtil.formatRank;
import static com.ruoyi.common.utils.GoogleUtil.getGoogleRank;
/**
* 关键词排名明细Service业务层处理
*
*
* @author vinjor-m
* @date 2025-08-21
*/
@ -50,26 +53,40 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl<BusiKeywordItemMappe
* @date 15:10 2025/8/21
**/
@Override
public void getKeywordRanking() {
public void getKeywordRanking() throws InterruptedException {
Date nowDate = new Date();
//所有网站
List<BaseSite> siteList = baseSiteService.list();
Map<String,String> siteMap = siteList.stream().collect(Collectors.toMap(BaseSite::getId, BaseSite::getSiteUrl));
List<BusiKeyword> keywordList = busiKeywordService.list();
List<BusiKeywordItem> insertList = new ArrayList<>();
int i = 0;
for (BusiKeyword keyword : keywordList) {
BusiKeywordItem busiKeywordItem = new BusiKeywordItem();
busiKeywordItem.setTitle(keyword.getId());
busiKeywordItem.setSelectDate(nowDate);
int ranking = GoogleUtil.getGoogleRank(keyword.getId(), siteMap.get(keyword.getTenantId()));
if(ranking>0 && ranking<=20){
System.out.println("" + (i+1) + "次搜索: 关键词=" + keyword.getId() + ", 目标网站=" + siteMap.get(keyword.getTenantId()));
int rank = 0;
try {
// 测试用例搜索关键词查找特定网站的排名
rank = getGoogleRank( keyword.getId(), siteMap.get(keyword.getTenantId()));
System.out.println("" + (i+1) + "次搜索:"+"排名:" + formatRank(rank));
} catch (Exception e) {
System.err.println("搜索过程中发生异常: " + e.getMessage());
}
// Simulate human pacing: sleep a random 10-30 seconds (10000 ms base + up to 20000 ms) before the next search
int randomSleep = 10000 + (int) (Math.random() * 20000);
System.out.println("等待 " + (randomSleep / 1000) + " 秒后进行下次搜索...");
Thread.sleep(randomSleep);
if(rank>0 && rank<=20){
//找到今天的排名
busiKeywordItem.setRanking(ranking);
busiKeywordItem.setRanking(rank);
busiKeywordItem.setTenantId(keyword.getTenantId());
insertList.add(busiKeywordItem);
//删除这个关键词今天的排名
busiKeywordItemMapper.deleteBySelectDateInt(DateUtil.formatDate(nowDate),busiKeywordItem.getTitle());
}
i++;
}
if(!insertList.isEmpty()){
this.saveBatch(insertList);

View File

@ -32,7 +32,7 @@ public class BusiTask {
* @author vinjor-M
* @date 14:51 2025/8/21
**/
public void updateKeywordRanking() {
public void updateKeywordRanking() throws InterruptedException {
    // Delegate the whole ranking refresh to the keyword-item service; an
    // InterruptedException raised there propagates to the caller unchanged.
    busiKeywordItemService.getKeywordRanking();

    // Only reached when the call above returned normally; the timestamp is
    // taken after the refresh has finished.
    String finishedMessage = "" + DateUtil.now() + "】执行更新本站使用关键词google排名成功";
    System.out.println(finishedMessage);
}

View File

@ -261,33 +261,25 @@ public class GoogleUtil {
}
ChromeOptions options = new ChromeOptions();
// 按环境变量决定是否 headless默认有头更像真实用户
boolean headless = Boolean.parseBoolean(System.getenv().getOrDefault("GOOGLE_HEADLESS", "false"));
if (headless) options.addArguments("--headless=new");
// // 按环境变量决定是否 headless默认有头更像真实用户
// boolean headless = Boolean.parseBoolean(System.getenv().getOrDefault("GOOGLE_HEADLESS", "false"));
// if (headless) options.addArguments("--headless=new");
// 使用真实的用户代理从现有浏览器中复制
options.addArguments("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36");
// 设置真实的窗口大小
options.addArguments("--window-size=1920,1080");
// 语言与用户体验设置
options.addArguments("--lang=zh-CN");
Map<String, Object> prefs = new HashMap<>();
prefs.put("intl.accept_languages", "zh-CN,zh;q=0.9");
options.setExperimentalOption("prefs", prefs);
// User-Agent 随机化
String randomUserAgent = USER_AGENTS.get((int) (Math.random() * USER_AGENTS.size()));
options.addArguments("--user-agent=" + randomUserAgent);
// 真实窗口大小避免固定指纹
int width = 1024 + (int) (Math.random() * 600);
int height = 700 + (int) (Math.random() * 500);
options.addArguments("--window-size=" + width + "," + height);
// 尽量减少自动化特征
// 禁用自动化控制特征
options.addArguments("--disable-blink-features=AutomationControlled");
options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
options.setExperimentalOption("useAutomationExtension", false);
// 添加更多真实用户参数
options.addArguments("--disable-extensions");
options.addArguments("--no-sandbox");
options.addArguments("--disable-dev-shm-usage");
options.addArguments("--ignore-ssl-errors");
options.addArguments("--ignore-certificate-errors");
options.addArguments("--lang=zh-CN");
options.addArguments("--disable-plugins");
options.addArguments("--disable-images"); // 可选禁用图片加载以提高速度
// 复用用户数据目录可降低验证码概率
String userDataDir = System.getenv("GOOGLE_CHROME_USER_DATA_DIR");
@ -529,15 +521,42 @@ public class GoogleUtil {
}
// 测试方法
public static void main(String[] args) {
for (int i = 0; i < 5; i++) {
// 测试用例搜索关键词查找特定网站的排名
int rank1 = getGoogleRank("大卡车", "zh.wikipedia.org");
System.out.println("维基百科排名:" + formatRank(rank1));
}
/**
 * Manual smoke test: runs a handful of rank lookups against randomly chosen
 * keyword/site pairs and prints each result, pausing between lookups.
 *
 * @param args unused
 * @throws InterruptedException if the thread is interrupted while pausing between searches
 */
public static void main(String[] args) throws InterruptedException {
    // Keyword / target-site pairs to pick from, so consecutive searches are not all identical.
    String[][] searchTasks = {
        {"大卡车", "zh.wikipedia.org"},
        {"汽车", "zh.wikipedia.org"},
        {"货车", "zh.wikipedia.org"},
        {"卡车运输", "zh.wikipedia.org"},
        {"交通运输", "zh.wikipedia.org"}
    };
    final int searchCount = 5;

    for (int i = 0; i < searchCount; i++) {
        // Pick one task at random (the same pair may be chosen more than once).
        int randomIndex = (int) (Math.random() * searchTasks.length);
        String keyword = searchTasks[randomIndex][0];
        String website = searchTasks[randomIndex][1];

        System.out.println("" + (i+1) + "次搜索: 关键词=" + keyword + ", 目标网站=" + website);

        try {
            // Look up the rank of the target site for this keyword.
            int rank = getGoogleRank(keyword, website);
            System.out.println("" + (i+1) + "次搜索:"+"排名:" + formatRank(rank));
        } catch (Exception e) {
            // Best effort: report the failure and carry on with the remaining searches.
            System.err.println("搜索过程中发生异常: " + e.getMessage());
        }

        // No search follows the last one, so do not announce or wait for a "next search".
        if (i == searchCount - 1) {
            break;
        }

        // Pause a random 10-30 seconds (10000 ms base + up to 20000 ms) between searches.
        int randomSleep = 10000 + (int) (Math.random() * 20000);
        System.out.println("等待 " + (randomSleep / 1000) + " 秒后进行下次搜索...");
        Thread.sleep(randomSleep);
    }
}
/**
* 提取并规范化 host
*/
@ -570,7 +589,7 @@ public class GoogleUtil {
// 辅助方法格式化排名输出
private static String formatRank(int rank) {
public static String formatRank(int rank) {
if (rank > 0) return rank + "";
else if (rank == -1) return "未找到";
else if (rank == -2) return "遇到验证码";