feat(utils): 新增GoogleRankUtil工具类,基于speculationrules实现稳定的Google排名查询
✨ 核心特性: - 基于Google speculationrules的JSON解析,不依赖DOM结构 - 支持多页搜索(前20名),自动翻页直到找到目标 - 精准的排名计算,baseRank + validRank机制 - 智能过滤Google自身域名,只计数真实搜索结果 - 会话复用机制,降低验证码触发率 - 丰富的配置选项(无头模式、请求间隔、重试策略等) 🔧 技术实现: - Jackson解析speculationrules脚本中的prefetch URLs - Selenium WebDriver自动化浏览器 - 人性化延时和滚动,模拟真实用户行为 - User-Agent随机化,增强反爬友好性 📊 测试结果: - "货车" → 维基百科排名第1名 ✓ - "大卡车" → 维基百科排名第12名(第2页第2个结果)✓ 优势:相比传统DOM解析方案更稳定、准确,不受页面结构变化影响
This commit is contained in:
parent
fe1f3264fa
commit
5bb2a9a1ee
@ -0,0 +1,630 @@
|
||||
package com.ruoyi.common.utils;
|
||||
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import io.github.bonigarcia.wdm.WebDriverManager;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.openqa.selenium.By;
|
||||
import org.openqa.selenium.JavascriptExecutor;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.openqa.selenium.support.ui.ExpectedConditions;
|
||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URL;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/**
|
||||
* Google排名查询工具类(基于speculationrules解析)
|
||||
*
|
||||
* 本工具类采用全新的解析方案:
|
||||
* 1. 不再解析复杂的DOM结构
|
||||
* 2. 直接提取Google返回的<script type="speculationrules">中的URL列表
|
||||
* 3. 该列表包含了所有搜索结果的URL,按排名顺序排列
|
||||
* 4. 更加稳定、准确、不受DOM结构变化影响
|
||||
*
|
||||
* @author menft
|
||||
*/
|
||||
public class GoogleRankUtil {
|
||||
private static final Logger log = LoggerFactory.getLogger(GoogleRankUtil.class);
|
||||
|
||||
// ==================== 配置参数 ====================
|
||||
private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q=";
|
||||
private static final int CONNECT_TIMEOUT = 10000;
|
||||
private static final int READ_TIMEOUT = 15000;
|
||||
|
||||
// User-Agent池
|
||||
private static final List<String> USER_AGENTS = Arrays.asList(
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36"
|
||||
);
|
||||
|
||||
// 全局配置开关
|
||||
private static volatile boolean HEADLESS = false;
|
||||
private static volatile boolean SESSION_REUSE = false;
|
||||
private static volatile long MIN_INTERVAL_MS = 8000;
|
||||
private static volatile long JITTER_MS = 5000;
|
||||
private static volatile int MAX_RETRIES = 3;
|
||||
private static volatile int BASE_BACKOFF_MS = 10000;
|
||||
private static volatile boolean KEEP_BROWSER_OPEN = false;
|
||||
|
||||
// 线程本地变量
|
||||
private static final ThreadLocal<WebDriver> TL_DRIVER = new ThreadLocal<>();
|
||||
private static final AtomicLong LAST_NAV_AT = new AtomicLong(0L);
|
||||
|
||||
// ==================== 配置方法 ====================
|
||||
|
||||
public static void setHeadless(boolean headless) {
|
||||
HEADLESS = headless;
|
||||
}
|
||||
|
||||
public static void setSessionReuse(boolean reuse) {
|
||||
SESSION_REUSE = reuse;
|
||||
}
|
||||
|
||||
public static void setGlobalRequestInterval(long minIntervalMs, long jitterMs) {
|
||||
MIN_INTERVAL_MS = Math.max(0, minIntervalMs);
|
||||
JITTER_MS = Math.max(0, jitterMs);
|
||||
}
|
||||
|
||||
public static void setRetryPolicy(int maxRetries, int baseBackoffMs) {
|
||||
MAX_RETRIES = Math.max(1, maxRetries);
|
||||
BASE_BACKOFF_MS = Math.max(1000, baseBackoffMs);
|
||||
}
|
||||
|
||||
public static void setKeepBrowserOpen(boolean keep) {
|
||||
KEEP_BROWSER_OPEN = keep;
|
||||
}
|
||||
|
||||
public static void shutdownDriver() {
|
||||
WebDriver d = TL_DRIVER.get();
|
||||
if (d != null) {
|
||||
try {
|
||||
d.quit();
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
TL_DRIVER.remove();
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== 核心方法 ====================
|
||||
|
||||
/**
|
||||
* 获取关键词在指定网站的Google排名
|
||||
*
|
||||
* @param searchText 搜索关键词
|
||||
* @param targetSite 目标网站(如:zh.wikipedia.org)
|
||||
* @return 排名(>0表示找到,-1表示未找到,-2表示遇到验证码,0表示查询失败)
|
||||
*/
|
||||
public static int getGoogleRank(String searchText, String targetSite) {
|
||||
// 入参校验
|
||||
if (searchText == null || searchText.trim().isEmpty() || targetSite == null || targetSite.trim().isEmpty()) {
|
||||
log.error("入参非法:searchText={}, targetSite={}", searchText, targetSite);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 重试策略
|
||||
for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
|
||||
int result = getGoogleRankInternal(searchText, targetSite);
|
||||
if (result != -2) { // 不是验证码错误
|
||||
return result;
|
||||
}
|
||||
|
||||
if (attempt < MAX_RETRIES) {
|
||||
int backoff = BASE_BACKOFF_MS * attempt;
|
||||
log.info("遇到验证码,{}毫秒后进行第{}次重试", backoff, attempt + 1);
|
||||
try {
|
||||
Thread.sleep(backoff);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.error("经过{}次尝试后仍然遇到验证码,搜索失败", MAX_RETRIES);
|
||||
return -2;
|
||||
}
|
||||
|
||||
/**
|
||||
* 内部实现:获取Google排名(支持多页搜索)
|
||||
*/
|
||||
private static int getGoogleRankInternal(String searchText, String targetSite) {
|
||||
targetSite = targetSite.trim();
|
||||
String encodedSearchText = encodeSearchText(searchText);
|
||||
if (encodedSearchText == null) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 构建基础搜索URL(使用udm=14参数强制网页结果)
|
||||
String baseUrl = GOOGLE_SEARCH_URL + encodedSearchText + "&newwindow=1&hl=zh-CN&pws=0&udm=14";
|
||||
log.info("发起Google搜索请求:url={}, 目标网站={}", baseUrl, targetSite);
|
||||
|
||||
WebDriver driver = null;
|
||||
try {
|
||||
// 初始化/复用 WebDriver
|
||||
driver = SESSION_REUSE ? getOrCreateDriver() : createWebDriver();
|
||||
throttleSleep("before-nav");
|
||||
|
||||
// 先访问 NCR 以避免地区重定向
|
||||
try {
|
||||
driver.get("https://www.google.com/ncr");
|
||||
throttleSleep("after-ncr");
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
|
||||
// 检查是否跳转到验证页面
|
||||
try {
|
||||
String cur = driver.getCurrentUrl().toLowerCase();
|
||||
if (cur.contains("/sorry/") || cur.contains("captcha")) {
|
||||
log.warn("发现跳转至验证页面: {}", cur);
|
||||
return -2;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("获取当前URL失败: {}", e.getMessage());
|
||||
return -2;
|
||||
}
|
||||
|
||||
// 【优化】搜索前2页(前20名)
|
||||
int rank = -1;
|
||||
for (int page = 1; page <= 2; page++) {
|
||||
String pageUrl = page == 1 ? baseUrl : baseUrl + "&start=" + ((page - 1) * 10);
|
||||
log.info("搜索第{}页: {}", page, pageUrl);
|
||||
|
||||
// 访问Google搜索页面
|
||||
driver.get(pageUrl);
|
||||
throttleSleep("after-search");
|
||||
|
||||
// 显式等待页面加载
|
||||
try {
|
||||
WebDriverWait wait = new WebDriverWait(driver, 20);
|
||||
wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("body")));
|
||||
} catch (Exception e) {
|
||||
log.warn("等待页面加载超时: {}", e.getMessage());
|
||||
}
|
||||
|
||||
// 人类化随机等待
|
||||
Thread.sleep((long) (2000 + Math.random() * 3000));
|
||||
|
||||
// 轻微滚动,模拟浏览
|
||||
try {
|
||||
JavascriptExecutor js = (JavascriptExecutor) driver;
|
||||
long steps = 2 + (int) (Math.random() * 3);
|
||||
for (int i = 0; i < steps; i++) {
|
||||
js.executeScript("window.scrollBy(0, arguments[0]);", 200 + (int) (Math.random() * 400));
|
||||
Thread.sleep((long) (400 + Math.random() * 600));
|
||||
}
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
|
||||
// 检查是否出现验证码
|
||||
if (isCaptchaPage(driver)) {
|
||||
log.warn("检测到验证码页面,本次请求失败");
|
||||
return -2;
|
||||
}
|
||||
|
||||
// 获取页面源码
|
||||
String html = driver.getPageSource();
|
||||
log.debug("第{}页请求成功:响应长度={}字节", page, html.length());
|
||||
|
||||
// 解析排名(使用speculationrules方案,传入baseRank)
|
||||
int baseRank = (page - 1) * 10;
|
||||
int pageRank = parseRankFromSpeculationRules(html, targetSite, baseRank);
|
||||
|
||||
if (pageRank > 0) {
|
||||
rank = pageRank;
|
||||
break; // 找到了,停止搜索
|
||||
}
|
||||
|
||||
// 页面间增加随机延时
|
||||
if (page < 2) {
|
||||
Thread.sleep((long) (2000 + Math.random() * 3000));
|
||||
}
|
||||
}
|
||||
|
||||
return rank;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("获取排名异常:url={}, 原因={}", baseUrl, e.getMessage(), e);
|
||||
return 0;
|
||||
} finally {
|
||||
// 关闭浏览器或复用
|
||||
if (!SESSION_REUSE) {
|
||||
if (driver != null) {
|
||||
if (!KEEP_BROWSER_OPEN) {
|
||||
try {
|
||||
driver.quit();
|
||||
} catch (Exception e) {
|
||||
log.error("关闭浏览器失败", e);
|
||||
}
|
||||
} else {
|
||||
log.info("根据配置保留浏览器窗口开启(KEEP_BROWSER_OPEN=true)");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log.debug("开启会话复用,保留当前线程的 WebDriver 实例");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 【核心方法】从speculationrules脚本中解析排名
|
||||
*
|
||||
* 解析逻辑:
|
||||
* 1. 查找<script type="speculationrules">标签
|
||||
* 2. 提取JSON内容
|
||||
* 3. 解析prefetch数组中的urls
|
||||
* 4. 按顺序匹配目标网站
|
||||
*
|
||||
* @param html HTML源码
|
||||
* @param targetSite 目标网站
|
||||
* @param baseRank 基础排名(第1页=0,第2页=10)
|
||||
* @return 排名(>0表示找到,-1表示未找到)
|
||||
*/
|
||||
private static int parseRankFromSpeculationRules(String html, String targetSite, int baseRank) {
|
||||
try {
|
||||
Document doc = Jsoup.parse(html);
|
||||
|
||||
// 查找所有speculationrules脚本
|
||||
Elements scripts = doc.select("script[type=speculationrules]");
|
||||
|
||||
if (scripts.isEmpty()) {
|
||||
log.warn("未找到speculationrules脚本,可能Google页面结构已变化");
|
||||
return -1;
|
||||
}
|
||||
|
||||
log.info("找到{}个speculationrules脚本", scripts.size());
|
||||
|
||||
// 提取所有URL(按顺序)
|
||||
List<String> allUrls = new ArrayList<>();
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
for (Element script : scripts) {
|
||||
String jsonContent = script.html();
|
||||
try {
|
||||
JsonNode root = mapper.readTree(jsonContent);
|
||||
JsonNode prefetchArray = root.get("prefetch");
|
||||
|
||||
if (prefetchArray != null && prefetchArray.isArray()) {
|
||||
for (JsonNode prefetchItem : prefetchArray) {
|
||||
JsonNode urlsNode = prefetchItem.get("urls");
|
||||
if (urlsNode != null && urlsNode.isArray()) {
|
||||
for (JsonNode urlNode : urlsNode) {
|
||||
String url = urlNode.asText();
|
||||
if (url != null && !url.isEmpty()) {
|
||||
allUrls.add(url);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("解析speculationrules JSON失败: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
log.info("从speculationrules中提取到{}个URL", allUrls.size());
|
||||
|
||||
// 【调试】打印所有URL
|
||||
for (int i = 0; i < allUrls.size(); i++) {
|
||||
log.debug("URL[{}]: {}", i + 1, allUrls.get(i));
|
||||
}
|
||||
|
||||
// 按顺序匹配目标网站
|
||||
String cleanTarget = normalizeHost(targetSite);
|
||||
log.info("规范化后的目标网站: {}", cleanTarget);
|
||||
|
||||
int validRank = 0; // 有效排名计数器
|
||||
for (int i = 0; i < allUrls.size(); i++) {
|
||||
String url = allUrls.get(i);
|
||||
String host = extractHost(url);
|
||||
|
||||
if (host != null && !host.isEmpty()) {
|
||||
// 过滤Google自身域名
|
||||
if (host.contains("google.")) {
|
||||
log.debug("跳过Google自身域名: {}", host);
|
||||
continue;
|
||||
}
|
||||
|
||||
validRank++; // 只计算非Google的URL
|
||||
|
||||
// 匹配目标网站
|
||||
boolean isMatch = isMatchingHost(host, cleanTarget);
|
||||
log.debug("URL[{}] host={}, target={}, isMatch={}", validRank, host, cleanTarget, isMatch);
|
||||
|
||||
if (isMatch) {
|
||||
int actualRank = baseRank + validRank;
|
||||
log.info("找到目标网站:排名={}, URL={}, host={}, target={}", actualRank, url, host, cleanTarget);
|
||||
return actualRank;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.info("在speculationrules中未找到目标网站:targetSite={}", targetSite);
|
||||
return -1;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("解析speculationrules失败", e);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断host是否匹配目标
|
||||
*/
|
||||
private static boolean isMatchingHost(String host, String cleanTarget) {
|
||||
if (host == null || cleanTarget == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
host = host.toLowerCase();
|
||||
cleanTarget = cleanTarget.toLowerCase();
|
||||
|
||||
// 精确匹配
|
||||
if (host.equals(cleanTarget)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// 子域名匹配
|
||||
if (host.endsWith("." + cleanTarget)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// 父域名匹配
|
||||
if (cleanTarget.endsWith("." + host)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// 模糊匹配(包含关系)
|
||||
if (host.contains(cleanTarget) || cleanTarget.contains(host)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* 提取并规范化host
|
||||
*/
|
||||
private static String extractHost(String urlStr) {
|
||||
try {
|
||||
if (urlStr.startsWith("http")) {
|
||||
URL u = new URL(urlStr);
|
||||
return normalizeHost(u.getHost());
|
||||
}
|
||||
if (urlStr.startsWith("//")) {
|
||||
URL u = new URL("https:" + urlStr);
|
||||
return normalizeHost(u.getHost());
|
||||
}
|
||||
return null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 规范化host
|
||||
*/
|
||||
private static String normalizeHost(String hostOrUrl) {
|
||||
if (hostOrUrl == null || hostOrUrl.isEmpty()) return "";
|
||||
String s = hostOrUrl.trim();
|
||||
s = s.replace("https://", "").replace("http://", "");
|
||||
if (s.contains("/")) s = s.substring(0, s.indexOf('/'));
|
||||
if (s.startsWith("www.")) s = s.substring(4);
|
||||
return s.toLowerCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* 编码搜索关键词
|
||||
*/
|
||||
private static String encodeSearchText(String searchText) {
|
||||
try {
|
||||
return URLEncoder.encode(searchText.trim(), StandardCharsets.UTF_8.name());
|
||||
} catch (Exception e) {
|
||||
log.error("关键词编码失败:searchText={}", searchText, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查是否为验证码页面
|
||||
*/
|
||||
private static boolean isCaptchaPage(WebDriver driver) {
|
||||
try {
|
||||
String currentUrl = "";
|
||||
String pageSource = "";
|
||||
try {
|
||||
currentUrl = driver.getCurrentUrl().toLowerCase();
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
try {
|
||||
pageSource = driver.getPageSource().toLowerCase();
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
|
||||
boolean isCaptchaUrl = currentUrl.contains("sorry/index") ||
|
||||
currentUrl.contains("security-check") ||
|
||||
currentUrl.contains("captcha");
|
||||
|
||||
boolean hasCaptchaKeywords = pageSource.contains("captcha") ||
|
||||
pageSource.contains("recaptcha") ||
|
||||
pageSource.contains("人机验证") ||
|
||||
pageSource.contains("异常流量");
|
||||
|
||||
return isCaptchaUrl || hasCaptchaKeywords;
|
||||
} catch (Exception e) {
|
||||
log.error("检查验证码页面时出错", e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 节流延迟
|
||||
*/
|
||||
private static void throttleSleep(String reason) {
|
||||
long now = System.currentTimeMillis();
|
||||
long last = LAST_NAV_AT.get();
|
||||
long gap = now - last;
|
||||
long need = MIN_INTERVAL_MS - gap;
|
||||
if (need > 0) {
|
||||
try {
|
||||
long extra = (long) (Math.random() * JITTER_MS);
|
||||
Thread.sleep(need + extra);
|
||||
} catch (InterruptedException ie) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
LAST_NAV_AT.set(System.currentTimeMillis());
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取或创建WebDriver
|
||||
*/
|
||||
private static WebDriver getOrCreateDriver() {
|
||||
WebDriver d = TL_DRIVER.get();
|
||||
boolean alive = false;
|
||||
if (d != null) {
|
||||
try {
|
||||
d.getTitle();
|
||||
alive = true;
|
||||
} catch (Exception ignore) {
|
||||
alive = false;
|
||||
}
|
||||
}
|
||||
if (!alive) {
|
||||
if (d != null) {
|
||||
try {
|
||||
d.quit();
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
TL_DRIVER.remove();
|
||||
}
|
||||
d = createWebDriver();
|
||||
TL_DRIVER.set(d);
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建WebDriver实例
|
||||
*/
|
||||
private static WebDriver createWebDriver() {
|
||||
try {
|
||||
WebDriverManager.chromedriver().setup();
|
||||
log.info("WebDriverManager 已自动配置 chromedriver");
|
||||
} catch (Throwable t) {
|
||||
log.warn("WebDriverManager 配置失败,尝试使用手动路径: {}", t.getMessage());
|
||||
String sysProp = System.getProperty("webdriver.chrome.driver");
|
||||
String envProp = System.getenv("CHROMEDRIVER_PATH");
|
||||
if (sysProp == null || sysProp.isEmpty()) {
|
||||
String path = (envProp != null && !envProp.isEmpty()) ? envProp : "/usr/local/bin/chromedriver";
|
||||
System.setProperty("webdriver.chrome.driver", path);
|
||||
log.info("使用ChromeDriver路径: {}", path);
|
||||
}
|
||||
}
|
||||
|
||||
ChromeOptions options = new ChromeOptions();
|
||||
|
||||
// 无头模式
|
||||
if (HEADLESS) {
|
||||
options.addArguments("--headless=new");
|
||||
}
|
||||
|
||||
// 语言设置
|
||||
options.addArguments("--lang=zh-CN");
|
||||
Map<String, Object> prefs = new HashMap<>();
|
||||
prefs.put("intl.accept_languages", "zh-CN,zh;q=0.9");
|
||||
options.setExperimentalOption("prefs", prefs);
|
||||
|
||||
// User-Agent随机化
|
||||
String randomUserAgent = USER_AGENTS.get((int) (Math.random() * USER_AGENTS.size()));
|
||||
options.addArguments("--user-agent=" + randomUserAgent);
|
||||
|
||||
// 窗口大小
|
||||
int width = 1024 + (int) (Math.random() * 600);
|
||||
int height = 700 + (int) (Math.random() * 500);
|
||||
options.addArguments("--window-size=" + width + "," + height);
|
||||
|
||||
// 减少自动化特征
|
||||
options.addArguments("--disable-blink-features=AutomationControlled");
|
||||
options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
|
||||
options.addArguments("--disable-extensions");
|
||||
options.addArguments("--no-sandbox");
|
||||
options.addArguments("--disable-dev-shm-usage");
|
||||
|
||||
// 是否保留浏览器窗口
|
||||
options.setExperimentalOption("detach", KEEP_BROWSER_OPEN);
|
||||
|
||||
WebDriver driver = new ChromeDriver(options);
|
||||
|
||||
// 隐藏webdriver标志
|
||||
try {
|
||||
((JavascriptExecutor) driver).executeScript(
|
||||
"Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
|
||||
return driver;
|
||||
}
|
||||
|
||||
/**
|
||||
* 格式化排名输出
|
||||
*/
|
||||
private static String formatRank(int rank) {
|
||||
if (rank > 0) return rank + "名";
|
||||
else if (rank == -1) return "未找到";
|
||||
else if (rank == -2) return "遇到验证码";
|
||||
else return "查询失败";
|
||||
}
|
||||
|
||||
// ==================== 测试方法 ====================
|
||||
|
||||
/**
|
||||
* 批量测试方法:一次性采集多个关键词的维基百科排名
|
||||
*/
|
||||
public static void main(String[] args) throws InterruptedException {
|
||||
// 配置参数
|
||||
GoogleRankUtil.setHeadless(false); // Mac环境建议 false
|
||||
GoogleRankUtil.setSessionReuse(true); // 复用浏览器会话
|
||||
GoogleRankUtil.setGlobalRequestInterval(8000, 5000); // 8s + 抖动 0~5s
|
||||
GoogleRankUtil.setRetryPolicy(3, 10000); // 重试3次,递增退避
|
||||
GoogleRankUtil.setKeepBrowserOpen(false); // 结束后关闭
|
||||
|
||||
// 测试关键词(先测试1个)
|
||||
List<String> keywords = Arrays.asList("货车","大卡车");
|
||||
String site = "zh.wikipedia.org";
|
||||
|
||||
System.out.println("========== GoogleRankUtil 测试 ==========");
|
||||
System.out.println("解析方案:speculationrules(稳定、准确)");
|
||||
System.out.println("目标站点: " + site);
|
||||
System.out.println("关键词数量: " + keywords.size());
|
||||
System.out.println("=========================================\n");
|
||||
|
||||
for (int i = 0; i < keywords.size(); i++) {
|
||||
String kw = keywords.get(i);
|
||||
System.out.println("[" + (i + 1) + "/" + keywords.size() + "] 正在采集关键词: " + kw);
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
int rank = getGoogleRank(kw, site);
|
||||
long elapsed = System.currentTimeMillis() - startTime;
|
||||
|
||||
System.out.println("关键词[" + kw + "] 维基百科排名: " + formatRank(rank));
|
||||
System.out.println("耗时: " + elapsed + "ms");
|
||||
System.out.println("-----------------------------------\n");
|
||||
}
|
||||
|
||||
System.out.println("========== 采集完成 ==========");
|
||||
// 清理
|
||||
GoogleRankUtil.shutdownDriver();
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user