feat(utils): 新增GoogleRankUtil工具类,基于speculationrules实现稳定的Google排名查询

 核心特性:
- 基于Google speculationrules的JSON解析,不依赖DOM结构
- 支持多页搜索(前20名),自动翻页直到找到目标
- 精准的排名计算,baseRank + validRank机制
- 智能过滤Google自身域名,只计数真实搜索结果
- 会话复用机制,降低验证码触发率
- 丰富的配置选项(无头模式、请求间隔、重试策略等)

🔧 技术实现:
- Jackson解析speculationrules脚本中的prefetch URLs
- Selenium WebDriver自动化浏览器
- 人性化延时和滚动,模拟真实用户行为
- User-Agent随机化,增强反爬友好性

📊 测试结果:
- "货车" → 维基百科排名第1名 ✓
- "大卡车" → 维基百科排名第12名(第2页第2个结果)✓

优势:相比传统DOM解析方案更稳定、准确,不受页面结构变化影响
This commit is contained in:
menft 2025-10-24 02:31:17 +08:00
parent fe1f3264fa
commit 5bb2a9a1ee

View File

@ -0,0 +1,630 @@
package com.ruoyi.common.utils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
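/**
 * Google rank lookup utility, based on parsing {@code speculationrules}.
 *
 * <p>Parsing approach used by this class:
 * <ol>
 *   <li>It does not walk the search page's result DOM structure.</li>
 *   <li>It extracts the URL list from the {@code <script type="speculationrules">}
 *       element(s) in the page returned by Google.</li>
 *   <li>That list is treated as the search-result URLs, in ranking order.</li>
 *   <li>The goal is a more stable and accurate lookup that is less affected by
 *       DOM structure changes.</li>
 * </ol>
 *
 * @author menft
 */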
public class GoogleRankUtil {
/** Class logger. */
private static final Logger log = LoggerFactory.getLogger(GoogleRankUtil.class);
// ==================== Configuration ====================
/** Search URL prefix; the URL-encoded keyword is appended directly after {@code q=}. */
private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q=";
// NOTE(review): CONNECT_TIMEOUT and READ_TIMEOUT are not referenced anywhere in this class
// as shown; presumably milliseconds left over from an HTTP-client based version — confirm.
private static final int CONNECT_TIMEOUT = 10000;
private static final int READ_TIMEOUT = 15000;
/** User-Agent pool; createWebDriver() picks one entry at random per browser instance. */
private static final List<String> USER_AGENTS = Arrays.asList(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36"
);
// Global switches. They are static, so they apply to every caller of this class;
// volatile gives visibility across threads, not atomic multi-field updates.
/** When true, createWebDriver() adds the "--headless=new" Chrome argument. */
private static volatile boolean HEADLESS = false;
/** When true, lookups use the calling thread's cached driver instead of a fresh one per call. */
private static volatile boolean SESSION_REUSE = false;
/** Minimum gap in milliseconds that throttleSleep() enforces between recorded navigations. */
private static volatile long MIN_INTERVAL_MS = 8000;
/** Upper bound in milliseconds of the random extra delay added when throttleSleep() has to wait. */
private static volatile long JITTER_MS = 5000;
/** Maximum number of attempts getGoogleRank() makes while the result is -2 (captcha). */
private static volatile int MAX_RETRIES = 3;
/** Back-off unit in milliseconds; getGoogleRank() sleeps this value times the attempt number. */
private static volatile int BASE_BACKOFF_MS = 10000;
/** When true, a non-reused driver is not quit after a lookup and Chrome is started with "detach". */
private static volatile boolean KEEP_BROWSER_OPEN = false;
// Per-thread / shared state
/** Driver cached for the current thread; used by getOrCreateDriver() and shutdownDriver(). */
private static final ThreadLocal<WebDriver> TL_DRIVER = new ThreadLocal<>();
/** Timestamp (System.currentTimeMillis) recorded at the end of the last throttleSleep() call; shared by all threads. */
private static final AtomicLong LAST_NAV_AT = new AtomicLong(0L);
// ==================== 配置方法 ====================
/**
 * Sets whether browsers created from now on are started in headless mode.
 * Only affects drivers created after the call; the flag is read in createWebDriver().
 *
 * @param headless true to add the "--headless=new" Chrome argument
 */
public static void setHeadless(boolean headless) {
HEADLESS = headless;
}
/**
 * Sets whether lookups reuse the calling thread's cached browser session.
 * When enabled, the driver is kept after each lookup and should eventually be
 * released with shutdownDriver() on the same thread.
 *
 * @param reuse true to reuse one driver per thread, false to create and close one per lookup
 */
public static void setSessionReuse(boolean reuse) {
SESSION_REUSE = reuse;
}
/**
 * Configures the global throttle applied between navigations.
 * Negative arguments are clamped to zero.
 *
 * @param minIntervalMs minimum gap between navigations, in milliseconds
 * @param jitterMs      upper bound of the random extra delay, in milliseconds
 */
public static void setGlobalRequestInterval(long minIntervalMs, long jitterMs) {
    MIN_INTERVAL_MS = minIntervalMs > 0L ? minIntervalMs : 0L;
    JITTER_MS = jitterMs > 0L ? jitterMs : 0L;
}
/**
 * Configures the captcha retry policy used by getGoogleRank().
 *
 * @param maxRetries    maximum number of attempts; values below 1 are raised to 1
 * @param baseBackoffMs back-off unit in milliseconds; values below 1000 are raised to 1000
 */
public static void setRetryPolicy(int maxRetries, int baseBackoffMs) {
    MAX_RETRIES = maxRetries < 1 ? 1 : maxRetries;
    BASE_BACKOFF_MS = baseBackoffMs < 1000 ? 1000 : baseBackoffMs;
}
/**
 * Sets whether a non-reused browser is left open after a lookup finishes.
 * The flag is also passed to Chrome as the "detach" experimental option when a driver is created.
 *
 * @param keep true to leave the browser window open
 */
public static void setKeepBrowserOpen(boolean keep) {
KEEP_BROWSER_OPEN = keep;
}
/**
 * Quits and forgets the WebDriver cached for the calling thread, if there is one.
 * Failures while quitting are ignored; the thread-local slot is cleared either way.
 */
public static void shutdownDriver() {
    WebDriver cached = TL_DRIVER.get();
    if (cached == null) {
        return;
    }
    try {
        cached.quit();
    } catch (Exception ignore) {
        // best effort: the browser may already be gone
    }
    TL_DRIVER.remove();
}
// ==================== 核心方法 ====================
/**
 * Looks up the Google rank of a site for a keyword, retrying when a captcha is hit.
 *
 * <p>Each attempt delegates to getGoogleRankInternal(). Only a captcha result (-2)
 * triggers a retry; the wait before retry {@code k+1} is {@code baseBackoff * k}
 * milliseconds. The retry settings are read once at the start of the call, so a
 * concurrent setRetryPolicy() cannot change the loop bounds mid-flight.
 *
 * @param searchText search keyword; must not be null or blank
 * @param targetSite target site, e.g. {@code zh.wikipedia.org}; must not be null or blank
 * @return a value &gt; 0 is the rank; -1 means not found; -2 means a captcha was
 *         encountered on every attempt (or the retry wait was interrupted);
 *         0 means invalid arguments or a failed query
 */
public static int getGoogleRank(String searchText, String targetSite) {
    // Argument validation
    if (searchText == null || searchText.trim().isEmpty() || targetSite == null || targetSite.trim().isEmpty()) {
        log.error("入参非法searchText={}, targetSite={}", searchText, targetSite);
        return 0;
    }
    // Snapshot the volatile settings so the whole call uses one consistent policy.
    final int maxRetries = MAX_RETRIES;
    final long baseBackoffMs = BASE_BACKOFF_MS;
    for (int attempt = 1; attempt <= maxRetries; attempt++) {
        int result = getGoogleRankInternal(searchText, targetSite);
        if (result != -2) { // anything but a captcha is final
            return result;
        }
        if (attempt == maxRetries) {
            break;
        }
        // long arithmetic: int multiplication could overflow to a negative sleep time
        long backoff = baseBackoffMs * attempt;
        log.info("遇到验证码,{}毫秒后进行第{}次重试", backoff, attempt + 1);
        try {
            Thread.sleep(backoff);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            // Report the real number of attempts instead of claiming all retries ran.
            log.warn("重试等待被中断,已完成{}次尝试,停止重试", attempt);
            return -2;
        }
    }
    log.error("经过{}次尝试后仍然遇到验证码,搜索失败", maxRetries);
    return -2;
}
/**
 * Performs one rank lookup attempt, scanning up to two result pages.
 *
 * <p>Flow: obtain a driver (reused or new), open {@code google.com/ncr}, bail out if
 * that landed on a verification page, then for each page load the search URL, wait,
 * scroll a little, check for a captcha and parse the page's speculationrules. The
 * scan stops at the first page on which the target is found.
 *
 * <p>The session-reuse flag is read once so that the driver is released in
 * {@code finally} under the same policy it was acquired with. If the thread is
 * interrupted during any wait, the interrupt flag is restored and 0 is returned.
 *
 * @param searchText raw keyword; trimmed and URL-encoded here
 * @param targetSite site to look for; trimmed here, normalized by the parser
 * @return rank (&gt; 0) when found; -1 when not found on either page; -2 when a
 *         verification/captcha page is detected or the current URL cannot be read;
 *         0 when encoding fails, the thread is interrupted, or any other exception occurs
 */
private static int getGoogleRankInternal(String searchText, String targetSite) {
    targetSite = targetSite.trim();
    String encodedSearchText = encodeSearchText(searchText);
    if (encodedSearchText == null) {
        return 0;
    }
    // Base search URL; udm=14 is added to force plain web results.
    String baseUrl = GOOGLE_SEARCH_URL + encodedSearchText + "&newwindow=1&hl=zh-CN&pws=0&udm=14";
    log.info("发起Google搜索请求url={}, 目标网站={}", baseUrl, targetSite);
    // Read the flag once: acquisition and cleanup must agree even if it is toggled concurrently.
    final boolean reuseSession = SESSION_REUSE;
    WebDriver driver = null;
    try {
        // Create or reuse the WebDriver
        driver = reuseSession ? getOrCreateDriver() : createWebDriver();
        throttleSleep("before-nav");
        // Visit NCR first to avoid a regional redirect (best effort)
        try {
            driver.get("https://www.google.com/ncr");
            throttleSleep("after-ncr");
        } catch (Exception ignore) {
            // the search itself may still work
        }
        // Did we land on a verification page?
        try {
            String cur = driver.getCurrentUrl().toLowerCase();
            if (cur.contains("/sorry/") || cur.contains("captcha")) {
                log.warn("发现跳转至验证页面: {}", cur);
                return -2;
            }
        } catch (Exception e) {
            log.warn("获取当前URL失败: {}", e.getMessage());
            return -2;
        }
        // Scan the first two pages (top 20)
        int rank = -1;
        for (int page = 1; page <= 2; page++) {
            String pageUrl = page == 1 ? baseUrl : baseUrl + "&start=" + ((page - 1) * 10);
            log.info("搜索第{}页: {}", page, pageUrl);
            driver.get(pageUrl);
            throttleSleep("after-search");
            // Explicit wait for the page body
            try {
                WebDriverWait wait = new WebDriverWait(driver, 20);
                wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("body")));
            } catch (Exception e) {
                log.warn("等待页面加载超时: {}", e.getMessage());
            }
            // Human-like random pause
            Thread.sleep((long) (2000 + Math.random() * 3000));
            // Light scrolling to mimic browsing (best effort)
            try {
                JavascriptExecutor js = (JavascriptExecutor) driver;
                long steps = 2 + (int) (Math.random() * 3);
                for (int i = 0; i < steps; i++) {
                    js.executeScript("window.scrollBy(0, arguments[0]);", 200 + (int) (Math.random() * 400));
                    Thread.sleep((long) (400 + Math.random() * 600));
                }
            } catch (InterruptedException ie) {
                // Interruption is not a scroll failure: let the outer handler deal with it.
                throw ie;
            } catch (Exception ignore) {
                // scrolling is cosmetic
            }
            // Captcha check
            if (isCaptchaPage(driver)) {
                log.warn("检测到验证码页面,本次请求失败");
                return -2;
            }
            String html = driver.getPageSource();
            log.debug("第{}页请求成功:响应长度={}字节", page, html.length());
            // Parse the rank; baseRank offsets results on later pages
            int baseRank = (page - 1) * 10;
            int pageRank = parseRankFromSpeculationRules(html, targetSite, baseRank);
            if (pageRank > 0) {
                rank = pageRank;
                break; // found: stop scanning
            }
            // Random delay between pages
            if (page < 2) {
                Thread.sleep((long) (2000 + Math.random() * 3000));
            }
        }
        return rank;
    } catch (InterruptedException ie) {
        // Restore the flag so callers further up can observe the interruption.
        Thread.currentThread().interrupt();
        log.warn("获取排名被中断url={}", baseUrl);
        return 0;
    } catch (Exception e) {
        log.error("获取排名异常url={}, 原因={}", baseUrl, e.getMessage(), e);
        return 0;
    } finally {
        // Close the browser, or keep it for reuse
        if (!reuseSession) {
            if (driver != null) {
                if (!KEEP_BROWSER_OPEN) {
                    try {
                        driver.quit();
                    } catch (Exception e) {
                        log.error("关闭浏览器失败", e);
                    }
                } else {
                    log.info("根据配置保留浏览器窗口开启KEEP_BROWSER_OPEN=true");
                }
            }
        } else {
            log.debug("开启会话复用,保留当前线程的 WebDriver 实例");
        }
    }
}
/**
 * Derives the rank of the target site from the page's speculationrules scripts.
 *
 * <p>Steps:
 * <ol>
 *   <li>select every {@code <script type="speculationrules">} element;</li>
 *   <li>parse each script body as JSON and collect, in document order, every
 *       non-empty string found under {@code prefetch[*].urls[*]};</li>
 *   <li>walk the collected URLs, skipping those without a usable host and those
 *       whose host contains {@code "google."};</li>
 *   <li>return {@code baseRank + position} of the first host that matches the target.</li>
 * </ol>
 *
 * @param html       page source
 * @param targetSite target site (host or URL)
 * @param baseRank   rank offset of the page (page 1 = 0, page 2 = 10)
 * @return rank (&gt; 0) when found; -1 when not found, when no speculationrules
 *         script exists, or when parsing fails
 */
private static int parseRankFromSpeculationRules(String html, String targetSite, int baseRank) {
    try {
        Elements ruleScripts = Jsoup.parse(html).select("script[type=speculationrules]");
        if (ruleScripts.isEmpty()) {
            log.warn("未找到speculationrules脚本可能Google页面结构已变化");
            return -1;
        }
        log.info("找到{}个speculationrules脚本", ruleScripts.size());

        // Collect every prefetch URL, preserving order
        List<String> orderedUrls = new ArrayList<>();
        ObjectMapper json = new ObjectMapper();
        for (Element ruleScript : ruleScripts) {
            String rawJson = ruleScript.html();
            try {
                JsonNode prefetchRules = json.readTree(rawJson).get("prefetch");
                if (prefetchRules == null || !prefetchRules.isArray()) {
                    continue;
                }
                for (JsonNode rule : prefetchRules) {
                    JsonNode urlList = rule.get("urls");
                    if (urlList == null || !urlList.isArray()) {
                        continue;
                    }
                    for (JsonNode entry : urlList) {
                        String candidate = entry.asText();
                        if (candidate != null && !candidate.isEmpty()) {
                            orderedUrls.add(candidate);
                        }
                    }
                }
            } catch (Exception e) {
                // A malformed script is skipped; the remaining scripts are still used.
                log.warn("解析speculationrules JSON失败: {}", e.getMessage());
            }
        }
        log.info("从speculationrules中提取到{}个URL", orderedUrls.size());

        // Debug: dump every URL with its 1-based position
        int position = 0;
        for (String collected : orderedUrls) {
            position++;
            log.debug("URL[{}]: {}", position, collected);
        }

        String cleanTarget = normalizeHost(targetSite);
        log.info("规范化后的目标网站: {}", cleanTarget);

        int counted = 0; // number of non-Google results seen so far
        for (String url : orderedUrls) {
            String host = extractHost(url);
            if (host == null || host.isEmpty()) {
                continue;
            }
            if (host.contains("google.")) {
                log.debug("跳过Google自身域名: {}", host);
                continue;
            }
            counted++;
            boolean isMatch = isMatchingHost(host, cleanTarget);
            log.debug("URL[{}] host={}, target={}, isMatch={}", counted, host, cleanTarget, isMatch);
            if (isMatch) {
                int actualRank = baseRank + counted;
                log.info("找到目标网站:排名={}, URL={}, host={}, target={}", actualRank, url, host, cleanTarget);
                return actualRank;
            }
        }
        log.info("在speculationrules中未找到目标网站targetSite={}", targetSite);
        return -1;
    } catch (Exception e) {
        log.error("解析speculationrules失败", e);
        return -1;
    }
}
/**
 * Decides whether a result host belongs to the target site.
 *
 * <p>Matching is done on whole domain labels, case-insensitively:
 * <ul>
 *   <li>exact match: {@code example.com} vs {@code example.com};</li>
 *   <li>sub-domain of the target: {@code m.example.com} vs {@code example.com};</li>
 *   <li>parent domain of the target: {@code wikipedia.org} vs {@code zh.wikipedia.org}.</li>
 * </ul>
 * Plain substring matching is deliberately not used: it would report
 * {@code netflix.com} as a hit for target {@code x.com}, and an empty target
 * as a hit for every host.
 *
 * @param host        host of a search result
 * @param cleanTarget normalized target host
 * @return true when the host matches the target; false when either argument is null or blank
 */
private static boolean isMatchingHost(String host, String cleanTarget) {
    if (host == null || cleanTarget == null) {
        return false;
    }
    String h = host.trim().toLowerCase(Locale.ROOT);
    String t = cleanTarget.trim().toLowerCase(Locale.ROOT);
    // An empty side can never identify a site.
    if (h.isEmpty() || t.isEmpty()) {
        return false;
    }
    // Exact match
    if (h.equals(t)) {
        return true;
    }
    // Result is a sub-domain of the target
    if (h.endsWith("." + t)) {
        return true;
    }
    // Result is a parent domain of the target
    return t.endsWith("." + h);
}
/**
 * Extracts the normalized host of a URL string.
 *
 * <p>Accepts strings starting with {@code http} and protocol-relative URLs starting
 * with {@code //} (treated as https). Anything else, including null and unparsable
 * input, yields null.
 *
 * @param urlStr URL taken from the speculationrules list
 * @return normalized host, or null when no host can be extracted
 */
private static String extractHost(String urlStr) {
    try {
        String absolute;
        if (urlStr.startsWith("http")) {
            absolute = urlStr;
        } else if (urlStr.startsWith("//")) {
            absolute = "https:" + urlStr;
        } else {
            return null;
        }
        return normalizeHost(new URL(absolute).getHost());
    } catch (Exception e) {
        // null input or malformed URL: treated as "no host"
        return null;
    }
}
/**
 * Reduces a host name or URL to a bare, lower-case host without a leading {@code www.}.
 *
 * <p>Processing order: trim and lower-case (locale-independent), drop a leading
 * {@code https://}, {@code http://} or {@code //}, cut at the first {@code /},
 * {@code ?} or {@code #}, drop a trailing {@code :port}, then drop a leading
 * {@code www.}. Lower-casing happens first so that inputs such as
 * {@code WWW.Example.com} or {@code HTTPS://example.com} are handled too.
 *
 * @param hostOrUrl host name or URL; may be null
 * @return normalized host, or an empty string for null/blank input
 */
private static String normalizeHost(String hostOrUrl) {
    if (hostOrUrl == null) {
        return "";
    }
    String s = hostOrUrl.trim().toLowerCase(Locale.ROOT);
    if (s.isEmpty()) {
        return "";
    }
    // Leading scheme (or protocol-relative prefix)
    if (s.startsWith("https://")) {
        s = s.substring("https://".length());
    } else if (s.startsWith("http://")) {
        s = s.substring("http://".length());
    } else if (s.startsWith("//")) {
        s = s.substring(2);
    }
    // Path, query and fragment are not part of the host
    int end = s.length();
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        if (c == '/' || c == '?' || c == '#') {
            end = i;
            break;
        }
    }
    s = s.substring(0, end);
    // Trailing ":port" — only when there is a single colon, so IPv6 literals are left alone
    int colon = s.lastIndexOf(':');
    if (colon >= 0 && colon == s.indexOf(':') && colon < s.length() - 1) {
        boolean digitsOnly = true;
        for (int i = colon + 1; i < s.length(); i++) {
            if (!Character.isDigit(s.charAt(i))) {
                digitsOnly = false;
                break;
            }
        }
        if (digitsOnly) {
            s = s.substring(0, colon);
        }
    }
    if (s.startsWith("www.")) {
        s = s.substring(4);
    }
    return s;
}
/**
 * URL-encodes the trimmed search keyword as UTF-8.
 *
 * @param searchText raw keyword
 * @return encoded keyword, or null when encoding fails (the failure is logged)
 */
private static String encodeSearchText(String searchText) {
    String encoded;
    try {
        String keyword = searchText.trim();
        String charsetName = StandardCharsets.UTF_8.name();
        encoded = URLEncoder.encode(keyword, charsetName);
    } catch (Exception e) {
        log.error("关键词编码失败searchText={}", searchText, e);
        encoded = null;
    }
    return encoded;
}
/**
 * Heuristically detects a captcha / verification page.
 *
 * <p>The lower-cased current URL is checked for known verification path fragments
 * and the lower-cased page source for captcha-related keywords. If either value
 * cannot be read from the driver it is treated as empty.
 *
 * @param driver driver whose current page is inspected
 * @return true when the URL or the page source looks like a verification page
 */
private static boolean isCaptchaPage(WebDriver driver) {
    try {
        String url;
        try {
            url = driver.getCurrentUrl().toLowerCase();
        } catch (Exception ignore) {
            url = "";
        }
        String source;
        try {
            source = driver.getPageSource().toLowerCase();
        } catch (Exception ignore) {
            source = "";
        }
        for (String marker : new String[] {"sorry/index", "security-check", "captcha"}) {
            if (url.contains(marker)) {
                return true;
            }
        }
        for (String keyword : new String[] {"captcha", "recaptcha", "人机验证", "异常流量"}) {
            if (source.contains(keyword)) {
                return true;
            }
        }
        return false;
    } catch (Exception e) {
        log.error("检查验证码页面时出错", e);
        return false;
    }
}
/**
 * Enforces the global minimum interval between navigations.
 *
 * <p>If less than MIN_INTERVAL_MS has elapsed since the last recorded navigation,
 * sleeps for the remainder plus a random jitter in {@code [0, JITTER_MS)}. The
 * navigation timestamp is updated afterwards in every case, including when the
 * sleep was interrupted (the interrupt flag is restored).
 *
 * @param reason label of the call site; currently not used by the implementation
 */
private static void throttleSleep(String reason) {
    long startedAt = System.currentTimeMillis();
    long elapsed = startedAt - LAST_NAV_AT.get();
    long remaining = MIN_INTERVAL_MS - elapsed;
    if (remaining > 0) {
        long jitter = (long) (Math.random() * JITTER_MS);
        try {
            Thread.sleep(remaining + jitter);
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
        }
    }
    LAST_NAV_AT.set(System.currentTimeMillis());
}
/**
 * Returns the calling thread's cached WebDriver, replacing it when it is missing or dead.
 *
 * <p>Liveness is probed with {@code getTitle()}; a driver that throws is quit
 * (best effort), removed from the thread-local and replaced by a new one.
 *
 * @return a live driver bound to the current thread
 */
private static WebDriver getOrCreateDriver() {
    WebDriver cached = TL_DRIVER.get();
    if (cached != null) {
        try {
            cached.getTitle();
            return cached; // still responsive
        } catch (Exception ignore) {
            // dead session: fall through and replace it
        }
        try {
            cached.quit();
        } catch (Exception ignore) {
            // nothing more we can do with a dead session
        }
        TL_DRIVER.remove();
    }
    WebDriver fresh = createWebDriver();
    TL_DRIVER.set(fresh);
    return fresh;
}
/**
 * Creates a new Chrome WebDriver configured for the rank lookup.
 *
 * <p>The chromedriver binary is resolved through WebDriverManager; if that fails
 * and the {@code webdriver.chrome.driver} system property is not set, the path is
 * taken from the {@code CHROMEDRIVER_PATH} environment variable or defaults to
 * {@code /usr/local/bin/chromedriver}. The browser is started with a zh-CN locale,
 * a random User-Agent from the pool, a random window size and several switches
 * that reduce automation fingerprints.
 *
 * @return a newly started driver
 */
private static WebDriver createWebDriver() {
    try {
        WebDriverManager.chromedriver().setup();
        log.info("WebDriverManager 已自动配置 chromedriver");
    } catch (Throwable setupFailure) {
        log.warn("WebDriverManager 配置失败,尝试使用手动路径: {}", setupFailure.getMessage());
        String configured = System.getProperty("webdriver.chrome.driver");
        String fromEnv = System.getenv("CHROMEDRIVER_PATH");
        if (configured == null || configured.isEmpty()) {
            String path;
            if (fromEnv == null || fromEnv.isEmpty()) {
                path = "/usr/local/bin/chromedriver";
            } else {
                path = fromEnv;
            }
            System.setProperty("webdriver.chrome.driver", path);
            log.info("使用ChromeDriver路径: {}", path);
        }
    }

    ChromeOptions chrome = new ChromeOptions();
    // Headless mode
    if (HEADLESS) {
        chrome.addArguments("--headless=new");
    }
    // Language
    chrome.addArguments("--lang=zh-CN");
    Map<String, Object> preferences = new HashMap<>();
    preferences.put("intl.accept_languages", "zh-CN,zh;q=0.9");
    chrome.setExperimentalOption("prefs", preferences);
    // Random User-Agent
    int agentIndex = (int) (Math.random() * USER_AGENTS.size());
    chrome.addArguments("--user-agent=" + USER_AGENTS.get(agentIndex));
    // Random window size
    int width = 1024 + (int) (Math.random() * 600);
    int height = 700 + (int) (Math.random() * 500);
    chrome.addArguments("--window-size=" + width + "," + height);
    // Fewer automation fingerprints
    chrome.addArguments("--disable-blink-features=AutomationControlled");
    chrome.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
    chrome.addArguments("--disable-extensions", "--no-sandbox", "--disable-dev-shm-usage");
    // Keep the browser window after the driver process ends, if configured
    chrome.setExperimentalOption("detach", KEEP_BROWSER_OPEN);

    WebDriver created = new ChromeDriver(chrome);
    // Hide navigator.webdriver on the initial page (best effort)
    try {
        JavascriptExecutor js = (JavascriptExecutor) created;
        js.executeScript(
                "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
    } catch (Exception ignore) {
        // cosmetic only
    }
    return created;
}
/**
 * Converts a rank code into display text.
 *
 * @param rank result of a lookup
 * @return the number itself for positive ranks; otherwise a fixed message for
 *         -1 (not found), -2 (captcha) and any other value (query failed)
 */
private static String formatRank(int rank) {
    if (rank > 0) {
        return String.valueOf(rank);
    }
    switch (rank) {
        case -1:
            return "未找到";
        case -2:
            return "遇到验证码";
        default:
            return "查询失败";
    }
}
// ==================== 测试方法 ====================
/**
 * Manual smoke test: looks up the Wikipedia rank of a few keywords in one run
 * and prints the result and elapsed time for each.
 *
 * @param args unused
 * @throws InterruptedException declared for callers; not thrown by the visible code paths
 */
public static void main(String[] args) throws InterruptedException {
    // Configuration
    GoogleRankUtil.setHeadless(false);                   // false is recommended on macOS
    GoogleRankUtil.setSessionReuse(true);                // reuse the browser session
    GoogleRankUtil.setGlobalRequestInterval(8000, 5000); // 8s plus 0-5s jitter
    GoogleRankUtil.setRetryPolicy(3, 10000);             // 3 attempts, increasing back-off
    GoogleRankUtil.setKeepBrowserOpen(false);            // close the browser when done

    // Keywords under test
    List<String> keywords = Arrays.asList("货车","大卡车");
    String site = "zh.wikipedia.org";
    int total = keywords.size();

    System.out.println("========== GoogleRankUtil 测试 ==========");
    System.out.println("解析方案speculationrules稳定、准确");
    System.out.println("目标站点: " + site);
    System.out.println("关键词数量: " + total);
    System.out.println("=========================================\n");

    int index = 0;
    for (String kw : keywords) {
        index++;
        System.out.println("[" + index + "/" + total + "] 正在采集关键词: " + kw);
        long startTime = System.currentTimeMillis();
        int rank = getGoogleRank(kw, site);
        long elapsed = System.currentTimeMillis() - startTime;
        System.out.println("关键词[" + kw + "] 维基百科排名: " + formatRank(rank));
        System.out.println("耗时: " + elapsed + "ms");
        System.out.println("-----------------------------------\n");
    }
    System.out.println("========== 采集完成 ==========");

    // Cleanup: release this thread's reused driver
    GoogleRankUtil.shutdownDriver();
}
}