From 09f1220ae38f2c4d31da86febeb8c19d9631f7b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=98=A5=E4=BA=91?= <1994398261@qq.com>
Date: Wed, 3 Sep 2025 22:07:26 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AF=8C=E6=96=87=E6=9C=AC=E7=BC=96=E8=BE=91?=
=?UTF-8?q?=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
dl_admin/ruoyi-common/pom.xml | 36 ++
.../com/ruoyi/common/utils/GoogleUtil.java | 360 ++++++++++++++++++
2 files changed, 396 insertions(+)
create mode 100644 dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java
diff --git a/dl_admin/ruoyi-common/pom.xml b/dl_admin/ruoyi-common/pom.xml
index 664dbd6..e939832 100644
--- a/dl_admin/ruoyi-common/pom.xml
+++ b/dl_admin/ruoyi-common/pom.xml
@@ -171,6 +171,42 @@
3.1.0
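+
+
+ <dependency>
+     <groupId>org.jsoup</groupId>
+     <artifactId>jsoup</artifactId>
+     <version>1.17.2</version>
+ </dependency>
+
+ <dependency>
+     <groupId>com.github.rholder</groupId>
+     <artifactId>guava-retrying</artifactId>
+     <version>2.0.0</version>
+ </dependency>
+
+
+ <dependency>
+     <groupId>org.seleniumhq.selenium</groupId>
+     <artifactId>selenium-java</artifactId>
+     <version>3.141.59</version>
+ </dependency>
+
+
+ <dependency>
+     <groupId>io.github.bonigarcia</groupId>
+     <artifactId>webdrivermanager</artifactId>
+     <version>5.6.3</version>
+ </dependency>
+
+
+ <dependency>
+     <groupId>org.apache.httpcomponents.client5</groupId>
+     <artifactId>httpclient5</artifactId>
+     <version>5.2.1</version>
+ </dependency>
+
+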
diff --git a/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java b/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java
new file mode 100644
index 0000000..5310f35
--- /dev/null
+++ b/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java
@@ -0,0 +1,360 @@
+package com.ruoyi.common.utils;
+
+import com.github.rholder.retry.Retryer;
+import com.github.rholder.retry.RetryerBuilder;
+import com.github.rholder.retry.StopStrategies;
+import com.github.rholder.retry.WaitStrategies;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+public class GoogleUtil {
+ // Class-wide SLF4J logger.
+ private static final Logger log = LoggerFactory.getLogger(GoogleUtil.class);
+ // Tunable settings (in a Ruoyi project these would preferably be read from application.yml via @Value).
+ // Base search URL; the URL-encoded query string is appended to it.
+ private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q=";
+ // Proxy endpoint; within this class it is referenced only by the commented-out block in createWebDriver().
+ private static final String PROXY_HOST = "127.0.0.1";
+ private static final int PROXY_PORT = 7897;
+ // NOTE(review): no method in this class reads the two timeouts below; presumably milliseconds - confirm or remove.
+ private static final int CONNECT_TIMEOUT = 10000;
+ private static final int READ_TIMEOUT = 15000;
+ // Retry policy: retry when the call throws any Exception or returns Boolean.FALSE,
+ // stop after 3 attempts, and wait a fixed 1 second between attempts.
+ // The explicit <Boolean> type argument is required: with a raw builder the predicate's
+ // parameter is Object and "!result" does not compile.
+ // NOTE(review): no method in this class currently calls RETRYER; getGoogleRank() has its own retry loop.
+ private static final Retryer<Boolean> RETRYER = RetryerBuilder.<Boolean>newBuilder()
+         .retryIfExceptionOfType(Exception.class)
+         .retryIfResult(result -> result != null && !result)
+         .withStopStrategy(StopStrategies.stopAfterAttempt(3))
+         .withWaitStrategy(WaitStrategies.fixedWait(1, TimeUnit.SECONDS))
+         .build();
+
+ // User-Agent pool (desktop and mobile identifiers); createWebDriver() picks one entry at random
+ // for every browser session. Declared as List<String> so that elements can be used as strings
+ // without casts.
+ private static final List<String> USER_AGENTS = Arrays.asList(
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
+         "Mozilla/5.0 (X11; Linux x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
+         "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
+         "Mozilla/5.0 (Android 14; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0"
+ );
+
+ /**
+  * Returns the position of {@code webSite} in Google's results for {@code searchText},
+  * using a Selenium-driven browser.
+  *
+  * <p>Each attempt is delegated to {@code getGoogleRankInternal}. Only a CAPTCHA result (-2)
+  * is retried, up to three attempts in total, sleeping 5 s before the second attempt and
+  * 10 s before the third.
+  *
+  * @param searchText search keywords; must not be null or blank
+  * @param webSite    site to look for; must not be null or blank
+  * @return a positive rank when the site is found; -1 when it is not found on the inspected
+  *         pages; -2 when every attempt hit a CAPTCHA or the retry wait was interrupted;
+  *         0 when the arguments are invalid or the lookup failed
+  */
+ public static int getGoogleRank(String searchText, String webSite) {
+     // Argument validation.
+     if (searchText == null || searchText.trim().isEmpty() || webSite == null || webSite.trim().isEmpty()) {
+         log.error("入参非法:searchText={}, webSite={}", searchText, webSite);
+         return 0;
+     }
+
+     final int maxRetries = 3;
+     for (int attempt = 1; attempt <= maxRetries; attempt++) {
+         int result = getGoogleRankInternal(searchText, webSite);
+         if (result != -2) { // anything other than "CAPTCHA encountered" is final
+             return result;
+         }
+         if (attempt == maxRetries) {
+             break;
+         }
+
+         log.info("遇到验证码,{}秒后进行第{}次重试", 5 * attempt, attempt + 1);
+         try {
+             Thread.sleep(5000L * attempt); // back-off grows with each attempt
+         } catch (InterruptedException e) {
+             // Restore the flag and stop right away; do not fall through to the
+             // "all attempts exhausted" message, which would be inaccurate here.
+             Thread.currentThread().interrupt();
+             log.warn("重试等待被中断,已完成{}次尝试,放弃后续重试", attempt);
+             return -2;
+         }
+     }
+
+     log.error("经过{}次尝试后仍然遇到验证码,搜索失败", maxRetries);
+     return -2;
+ }
+
+ /**
+  * Runs a single ranking attempt: starts a browser, loads up to three Google result pages
+  * for {@code searchText} and looks for {@code webSite} on each of them.
+  *
+  * @param searchText search keywords; URL-encoded by {@code encodeSearchText} before use
+  * @param webSite    site to look for; trimmed before matching
+  * @return a positive rank when the site is found; -1 when none of the three pages contains it;
+  *         -2 when a CAPTCHA page is detected; 0 when encoding fails, the thread is interrupted,
+  *         or any other exception occurs
+  */
+ private static int getGoogleRankInternal(String searchText, String webSite) {
+     webSite = webSite.trim();
+     String encodedSearchText = encodeSearchText(searchText);
+     if (encodedSearchText == null) {
+         return 0;
+     }
+     String finalUrl = GOOGLE_SEARCH_URL + encodedSearchText;
+     log.info("发起Google搜索请求:url={}, 目标网站={}", finalUrl, webSite);
+
+     WebDriver driver = null;
+     boolean interrupted = false;
+     try {
+         driver = createWebDriver();
+
+         // Inspect the first three result pages.
+         for (int page = 1; page <= 3; page++) {
+             int offset = (page - 1) * 10; // the code assumes 10 results per page
+             String pageUrl = page == 1 ? finalUrl : finalUrl + "&start=" + offset;
+             log.info("搜索第{}页: {}", page, pageUrl);
+
+             driver.get(pageUrl);
+
+             // Random 3-8 s pause to look less like an automated client.
+             Thread.sleep((long) (3000 + Math.random() * 5000));
+
+             if (isCaptchaPage(driver)) {
+                 log.warn("检测到验证码页面,本次请求失败");
+                 return -2; // sentinel: CAPTCHA encountered
+             }
+
+             String html = driver.getPageSource();
+             log.debug("第{}页请求成功:响应长度={}字节", page, html.length());
+
+             int pageRank = parseRankFromHtml(html, webSite, offset);
+             if (pageRank > 0) {
+                 return pageRank;
+             }
+
+             // Random 2-5 s pause between pages.
+             if (page < 3) {
+                 Thread.sleep((long) (2000 + Math.random() * 3000));
+             }
+         }
+
+         return -1;
+     } catch (InterruptedException e) {
+         // Handled separately from the generic catch so the interrupt is not silently lost.
+         interrupted = true;
+         log.warn("获取排名被中断:url={}", finalUrl);
+         return 0;
+     } catch (Exception e) {
+         log.error("获取排名异常:url={}, 原因={}", finalUrl, e.getMessage(), e);
+         return 0;
+     } finally {
+         // Always shut the browser down.
+         if (driver != null) {
+             try {
+                 driver.quit();
+             } catch (Exception e) {
+                 log.error("关闭浏览器失败", e);
+             }
+         }
+         // Restore the interrupt flag only after quit(), so the shutdown request itself
+         // is not issued from an already-interrupted thread.
+         if (interrupted) {
+             Thread.currentThread().interrupt();
+         }
+     }
+ }
+
+ /**
+  * Creates a headless Chrome {@link WebDriver} configured with a randomly chosen User-Agent
+  * and a set of flags intended to make the session look less like automation.
+  *
+  * <p>The {@code webdriver.chrome.driver} system property is honoured when it is already set;
+  * the bundled Windows path is used only as a fallback.
+  *
+  * @return a started WebDriver; the caller is responsible for calling {@code quit()}
+  */
+ private static WebDriver createWebDriver() {
+     // Fall back to the default driver location only when none was configured externally,
+     // so the path can be overridden with -Dwebdriver.chrome.driver=... on other machines.
+     if (System.getProperty("webdriver.chrome.driver") == null) {
+         System.setProperty("webdriver.chrome.driver", "D:/chromedriver-win64/chromedriver.exe");
+     }
+
+     ChromeOptions options = new ChromeOptions();
+     // Headless mode (Selenium 3.x syntax).
+     options.addArguments("--headless");
+     // Random User-Agent from the pool.
+     int agentIndex = (int) (Math.random() * USER_AGENTS.size());
+     options.addArguments("--user-agent=" + USER_AGENTS.get(agentIndex));
+     // Skip image loading for speed.
+     options.addArguments("--blink-settings=imagesEnabled=false");
+     // Window size.
+     options.addArguments("--window-size=1920,1080");
+     // Hide automation-control hints.
+     options.addArguments("--disable-blink-features=AutomationControlled");
+     options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
+     // Extensions off, sandbox off, /dev/shm usage off.
+     options.addArguments("--disable-extensions");
+     options.addArguments("--no-sandbox");
+     options.addArguments("--disable-dev-shm-usage");
+     // NOTE(review): certificate validation is switched off here; confirm this is acceptable.
+     options.addArguments("--ignore-ssl-errors");
+     options.addArguments("--ignore-certificate-errors");
+     // Reduce Chrome log output.
+     options.addArguments("--log-level=3");
+     options.addArguments("--silent");
+
+     // Optional proxy (rotating proxy IPs can help avoid CAPTCHAs); currently disabled:
+     // if (PROXY_HOST != null && !PROXY_HOST.isEmpty()) {
+     //     // A proxy-IP service could be integrated here to use a different IP each time.
+     //     options.addArguments("--proxy-server=http://" + PROXY_HOST + ":" + PROXY_PORT);
+     // }
+
+     WebDriver driver = new ChromeDriver(options);
+     try {
+         // Hide navigator.webdriver.
+         // NOTE(review): this runs against the initial blank page; a property defined this way
+         // normally does not survive navigation to another document - confirm it has the
+         // intended effect on the pages loaded later.
+         ((org.openqa.selenium.JavascriptExecutor) driver).executeScript(
+                 "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
+     } catch (RuntimeException e) {
+         // The caller never receives the driver on this path, so it must be closed here
+         // or the browser process would be leaked.
+         try {
+             driver.quit();
+         } catch (RuntimeException quitError) {
+             log.error("关闭浏览器失败", quitError);
+         }
+         throw e;
+     }
+
+     return driver;
+ }
+
+ /**
+ * 编码搜索关键词
+ */
+ private static String encodeSearchText(String searchText) {
+ try {
+ return URLEncoder.encode(searchText.trim(), StandardCharsets.UTF_8.name());
+ } catch (Exception e) {
+ log.error("关键词编码失败:searchText={}, 编码格式={}", searchText, StandardCharsets.UTF_8.name(), e);
+ return null;
+ }
+ }
+
+ /**
+  * Finds the rank of {@code targetWebSite} among the organic results of one result page.
+  *
+  * <p>Every {@code div.g} container that yields a result link counts as one position.
+  * Hosts are compared case-insensitively after removing the scheme, {@code www.} and the path.
+  *
+  * @param html          page source of the result page
+  * @param targetWebSite site to look for (URL or bare host)
+  * @param baseRank      number of results on preceding pages, added to the position on this page
+  * @return {@code baseRank} plus the 1-based position on this page, or -1 when the page has no
+  *         result containers, the target has no usable host, or the target is not found
+  */
+ private static int parseRankFromHtml(String html, String targetWebSite, int baseRank) {
+     Document doc = Jsoup.parse(html);
+
+     // All candidate result containers.
+     Elements resultContainers = doc.select("div.g");
+     if (resultContainers.isEmpty()) {
+         log.warn("未找到任何搜索结果容器");
+         return -1;
+     }
+     log.debug("找到 {} 个搜索结果容器", resultContainers.size());
+
+     String cleanTarget = normalizeHost(targetWebSite);
+     if (cleanTarget.isEmpty()) {
+         // An empty string is a substring of every host, so the first result would always
+         // "match" (e.g. for a target of "http://" or "/").
+         log.warn("目标网站解析后为空,无法匹配:targetWebSite={}", targetWebSite);
+         return -1;
+     }
+
+     int validResultCount = 0;
+     for (Element container : resultContainers) {
+         // Skip image/video style blocks.
+         if (!container.select("div.XaIwc, div.islrc, div.JTuIPc, div.PiKbc, div[jscontroller]").isEmpty()) {
+             continue;
+         }
+
+         String resultUrl = findResultUrl(container);
+         if (resultUrl == null) {
+             continue;
+         }
+         validResultCount++;
+
+         String cleanResult = normalizeHost(resultUrl);
+         if (cleanResult.isEmpty()) {
+             // Relative or host-less link: it still occupies a position but cannot be the target.
+             continue;
+         }
+
+         // NOTE(review): substring matching in both directions is kept from the original; it lets
+         // sub-domains match but also lets "a.com" match "banana.com" - confirm this is intended.
+         if (cleanResult.contains(cleanTarget) || cleanTarget.contains(cleanResult)) {
+             int actualRank = baseRank + validResultCount;
+             log.info("找到目标网站:排名={}, 结果URL={}, 目标网站={}", actualRank, resultUrl, targetWebSite);
+             return actualRank;
+         }
+     }
+
+     log.info("在当前页面未找到目标网站:targetWebSite={}", targetWebSite);
+     return -1;
+ }
+
+ /** Returns the href of the first link in {@code container} recognised as a result link, or null if none. */
+ private static String findResultUrl(Element container) {
+     for (Element link : container.select("a[href]")) {
+         String href = link.attr("href");
+
+         // Skip Google-internal links.
+         if (href.contains("google.com") || href.startsWith("/url?") || href.startsWith("#")) {
+             continue;
+         }
+
+         // Title link: the anchor sits directly inside an h3.
+         Element parent = link.parent();
+         if (parent != null && parent.tagName().equals("h3")) {
+             return href;
+         }
+
+         // Otherwise accept the link when any ancestor carries a known result-block class.
+         for (Element ancestor = parent; ancestor != null; ancestor = ancestor.parent()) {
+             if (ancestor.hasClass("yuRUbf") || ancestor.hasClass("tF2Cxc")) {
+                 return href;
+             }
+         }
+     }
+     return null;
+ }
+
+ /** Lower-cases {@code url}, removes "https://", "http://" and "www.", and drops everything from the first "/". */
+ private static String normalizeHost(String url) {
+     String host = url.toLowerCase(java.util.Locale.ROOT)
+             .replace("https://", "")
+             .replace("http://", "")
+             .replace("www.", "");
+     int firstSlash = host.indexOf('/');
+     return firstSlash >= 0 ? host.substring(0, firstSlash) : host;
+ }
+
+ /**
+  * Checks whether the page currently loaded in {@code driver} looks like a CAPTCHA or
+  * "unusual traffic" interstitial, based on keywords in the page source and the current URL.
+  *
+  * @param driver WebDriver instance whose current page is inspected
+  * @return {@code true} when a marker is found in the page source or URL; {@code false}
+  *         otherwise, including when querying the driver throws (the error is logged)
+  */
+ private static boolean isCaptchaPage(WebDriver driver) {
+     try {
+         // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
+         String pageSource = driver.getPageSource().toLowerCase(java.util.Locale.ROOT);
+         String currentUrl = driver.getCurrentUrl().toLowerCase(java.util.Locale.ROOT);
+
+         // "recaptcha" is not listed separately: any text containing it also contains "captcha".
+         String[] sourceMarkers = {
+                 "captcha",
+                 "人机验证",
+                 "异常流量",
+                 "security check",
+                 "sorry/index",
+                 "before we can serve your request"
+         };
+         for (String marker : sourceMarkers) {
+             if (pageSource.contains(marker)) {
+                 return true;
+             }
+         }
+
+         String[] urlMarkers = {"sorry/index", "security-check", "captcha"};
+         for (String marker : urlMarkers) {
+             if (currentUrl.contains(marker)) {
+                 return true;
+             }
+         }
+
+         return false;
+     } catch (Exception e) {
+         log.error("检查验证码页面时出错", e);
+         return false;
+     }
+ }
+
+ // 测试方法
+ public static void main(String[] args) {
+ // 测试用例:搜索关键词,查找特定网站的排名
+ int rank1 = getGoogleRank("大卡车", "zh.wikipedia.org");
+ System.out.println("维基百科排名:" + formatRank(rank1));
+ }
+
+ /**
+  * Turns a rank code into display text: a positive rank is printed with its suffix,
+  * -1 means "not found", -2 means "CAPTCHA encountered", anything else means "lookup failed".
+  *
+  * @param rank value returned by {@code getGoogleRank}
+  * @return the display text for {@code rank}
+  */
+ private static String formatRank(int rank) {
+     if (rank > 0) {
+         return rank + "名";
+     }
+     switch (rank) {
+         case -1:
+             return "未找到";
+         case -2:
+             return "遇到验证码";
+         default:
+             return "查询失败";
+     }
+ }
+}