From 09f1220ae38f2c4d31da86febeb8c19d9631f7b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=98=A5=E4=BA=91?= <1994398261@qq.com> Date: Wed, 3 Sep 2025 22:07:26 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AF=8C=E6=96=87=E6=9C=AC=E7=BC=96=E8=BE=91?= =?UTF-8?q?=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dl_admin/ruoyi-common/pom.xml | 36 ++ .../com/ruoyi/common/utils/GoogleUtil.java | 360 ++++++++++++++++++ 2 files changed, 396 insertions(+) create mode 100644 dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java diff --git a/dl_admin/ruoyi-common/pom.xml b/dl_admin/ruoyi-common/pom.xml index 664dbd6..e939832 100644 --- a/dl_admin/ruoyi-common/pom.xml +++ b/dl_admin/ruoyi-common/pom.xml @@ -171,6 +171,42 @@ 3.1.0 + + + + org.jsoup + jsoup + 1.17.2 + + + + com.github.rholder + guava-retrying + 2.0.0 + + + + + org.seleniumhq.selenium + selenium-java + 3.141.59 + + + + + io.github.bonigarcia + webdrivermanager + 5.6.3 + + + + + org.apache.httpcomponents.client5 + httpclient5 + 5.2.1 + + + diff --git a/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java b/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java new file mode 100644 index 0000000..5310f35 --- /dev/null +++ b/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java @@ -0,0 +1,360 @@ +package com.ruoyi.common.utils; + +import com.github.rholder.retry.Retryer; +import com.github.rholder.retry.RetryerBuilder; +import com.github.rholder.retry.StopStrategies; +import com.github.rholder.retry.WaitStrategies; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; + +public class GoogleUtil { + private static final Logger log = LoggerFactory.getLogger(GoogleUtil.class); + // 可配置参数(Ruoyi项目建议用@Value从application.yml读取) + private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q="; + private static final String PROXY_HOST = "127.0.0.1"; + private static final int PROXY_PORT = 7897; + private static final int CONNECT_TIMEOUT = 10000; + private static final int READ_TIMEOUT = 15000; + // 重试机制 + private static final Retryer RETRYER = RetryerBuilder.newBuilder() + .retryIfExceptionOfType(Exception.class) + .retryIfResult(result -> result != null && !result) + .withStopStrategy(StopStrategies.stopAfterAttempt(3)) + .withWaitStrategy(WaitStrategies.fixedWait(1, TimeUnit.SECONDS)) + .build(); + + // 扩展User-Agent池(增加移动端标识,反爬更友好) + private static final List USER_AGENTS = Arrays.asList( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36", + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1", + "Mozilla/5.0 (Android 14; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0" + ); + + /** + * 核心方法:获取Google搜索排名(使用Selenium实现) + */ + public static int getGoogleRank(String searchText, String webSite) { + // 入参校验 + if (searchText == null || searchText.trim().isEmpty() || webSite == null || webSite.trim().isEmpty()) { + log.error("入参非法:searchText={}, webSite={}", searchText, webSite); + return 0; + } + + // 最多重试3次 + int maxRetries = 3; + for (int attempt = 1; attempt <= maxRetries; attempt++) { + int result = getGoogleRankInternal(searchText, webSite); + if (result != -2) { // 不是验证码错误 + return result; + } + + if (attempt < maxRetries) { + log.info("遇到验证码,{}秒后进行第{}次重试", 5 * attempt, attempt + 1); + try { + Thread.sleep(5000 * attempt); // 递增延迟重试 + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + } + + log.error("经过{}次尝试后仍然遇到验证码,搜索失败", maxRetries); + return -2; + } + + /** + * 内部方法:执行Google搜索排名获取 + */ + private static int getGoogleRankInternal(String searchText, String webSite) { + webSite = webSite.trim(); + String encodedSearchText = encodeSearchText(searchText); + if (encodedSearchText == null) { + return 0; + } + String finalUrl = GOOGLE_SEARCH_URL + encodedSearchText; + log.info("发起Google搜索请求:url={}, 目标网站={}", finalUrl, webSite); + + WebDriver driver = null; + try { + // 初始化WebDriver + driver = createWebDriver(); + + int rank = -1; + // 搜索前三页 + for (int page = 1; page <= 3; page++) { + String pageUrl = page == 1 ? finalUrl : finalUrl + "&start=" + ((page - 1) * 10); + log.info("搜索第{}页: {}", page, pageUrl); + + // 访问Google搜索页面 + driver.get(pageUrl); + + // 随机延时,模拟人类行为(增加延时以减少被识别为机器人的可能性) + Thread.sleep((long) (3000 + Math.random() * 5000)); + + // 检查是否出现了验证码页面 + if (isCaptchaPage(driver)) { + log.warn("检测到验证码页面,本次请求失败"); + return -2; // 特殊返回值表示遇到验证码 + } + + // 获取页面源码 + String html = driver.getPageSource(); + log.debug("第{}页请求成功:响应长度={}字节", page, html.length()); + + // 解析当前页的排名 + int pageRank = parseRankFromHtml(html, webSite, (page - 1) * 10); + if (pageRank > 0) { + rank = pageRank; + break; + } + + // 页面间增加随机延时 + if (page < 3) { + Thread.sleep((long) (2000 + Math.random() * 3000)); + } + } + + return rank; + } catch (Exception e) { + log.error("获取排名异常:url={}, 原因={}", finalUrl, e.getMessage(), e); + return 0; + } finally { + // 关闭浏览器 + if (driver != null) { + try { + driver.quit(); + } catch (Exception e) { + log.error("关闭浏览器失败", e); + } + } + } + } + + /** + * 创建WebDriver实例 + * @return WebDriver + */ + private static WebDriver createWebDriver() { + // 设置ChromeDriver路径(根据实际路径调整) + System.setProperty("webdriver.chrome.driver", "D:/chromedriver-win64/chromedriver.exe"); + + ChromeOptions options = new ChromeOptions(); + // 设置无头模式 (Selenium 3.x语法) + options.addArguments("--headless"); + // 设置User-Agent + String randomUserAgent = USER_AGENTS.get((int) (Math.random() * USER_AGENTS.size())); + options.addArguments("--user-agent=" + randomUserAgent); + // 禁用图片加载提高速度 + options.addArguments("--blink-settings=imagesEnabled=false"); + // 设置窗口大小 + options.addArguments("--window-size=1920,1080"); + // 禁用自动化控制特征 + options.addArguments("--disable-blink-features=AutomationControlled"); + options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); + // 禁用自动化标志 + options.addArguments("--disable-extensions"); + options.addArguments("--no-sandbox"); + options.addArguments("--disable-dev-shm-usage"); + // 禁用SSL错误 + options.addArguments("--ignore-ssl-errors"); + options.addArguments("--ignore-certificate-errors"); + // 禁用日志 + options.addArguments("--log-level=3"); + options.addArguments("--silent"); + + // 如果需要使用代理(轮换代理IP可以有效避免验证码) + /* + if (PROXY_HOST != null && !PROXY_HOST.isEmpty()) { + // 可以在这里集成代理IP服务,每次使用不同的IP + options.addArguments("--proxy-server=http://" + PROXY_HOST + ":" + PROXY_PORT); + } + */ + + WebDriver driver = new ChromeDriver(options); + // 执行JavaScript隐藏webdriver属性 + ((org.openqa.selenium.JavascriptExecutor) driver).executeScript( + "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"); + + return driver; + } + + /** + * 编码搜索关键词 + */ + private static String encodeSearchText(String searchText) { + try { + return URLEncoder.encode(searchText.trim(), StandardCharsets.UTF_8.name()); + } catch (Exception e) { + log.error("关键词编码失败:searchText={}, 编码格式={}", searchText, StandardCharsets.UTF_8.name(), e); + return null; + } + } + + /** + * 解析排名 + */ + private static int parseRankFromHtml(String html, String targetWebSite, int baseRank) { + Document doc = Jsoup.parse(html); + + // 获取所有可能的搜索结果容器 + Elements resultContainers = doc.select("div.g"); + + if (resultContainers.isEmpty()) { + log.warn("未找到任何搜索结果容器"); + return -1; + } + + log.debug("找到 {} 个搜索结果容器", resultContainers.size()); + + String cleanTarget = targetWebSite + .replace("https://", "") + .replace("http://", "") + .replace("www.", ""); + + // 如果目标网站包含路径,则只取域名部分进行比较 + if (cleanTarget.contains("/")) { + cleanTarget = cleanTarget.substring(0, cleanTarget.indexOf("/")); + } + + int validResultCount = 0; + + for (Element container : resultContainers) { + // 跳过图片和视频区块 + if (container.select("div.XaIwc, div.islrc, div.JTuIPc, div.PiKbc, div[jscontroller]").size() > 0) { + continue; + } + + // 查找容器中的链接 + Elements links = container.select("a[href]"); + if (links.isEmpty()) { + continue; + } + + boolean isSearchResult = false; + String resultUrl = ""; + + // 查找有效的搜索结果链接 + for (Element link : links) { + String href = link.attr("href"); + + // 跳过Google内部链接 + if (href.contains("google.com") || href.startsWith("/url?") || href.startsWith("#")) { + continue; + } + + // 检查是否是标题链接(通常在h3标签内) + if (link.parent() != null && link.parent().tagName().equals("h3")) { + isSearchResult = true; + resultUrl = href; + break; + } + + // 或者检查是否在常见的搜索结果区块中 + Element parent = link.parent(); + while (parent != null) { + if (parent.hasClass("yuRUbf") || parent.hasClass("tF2Cxc")) { + isSearchResult = true; + resultUrl = href; + break; + } + parent = parent.parent(); + } + + if (isSearchResult) { + break; + } + } + + if (!isSearchResult) { + continue; + } + + validResultCount++; + + String cleanResult = resultUrl + .replace("https://", "") + .replace("http://", "") + .replace("www.", ""); + + // 如果结果URL包含路径,则只取域名部分进行比较 + if (cleanResult.contains("/")) { + int firstSlash = cleanResult.indexOf("/"); + if (firstSlash > 0) { + cleanResult = cleanResult.substring(0, firstSlash); + } + } + + // 匹配目标网站 + if (cleanResult.contains(cleanTarget) || cleanTarget.contains(cleanResult)) { + int actualRank = baseRank + validResultCount; + log.info("找到目标网站:排名={}, 结果URL={}, 目标网站={}", actualRank, resultUrl, targetWebSite); + return actualRank; + } + } + + log.info("在当前页面未找到目标网站:targetWebSite={}", targetWebSite); + return -1; + } + + /** + * 检查是否为验证码页面 + * @param driver WebDriver实例 + * @return 是否为验证码页面 + */ + private static boolean isCaptchaPage(WebDriver driver) { + try { + String pageSource = driver.getPageSource().toLowerCase(); + String currentUrl = driver.getCurrentUrl().toLowerCase(); + + // 检查页面是否包含验证码相关关键词 + boolean hasCaptchaKeywords = pageSource.contains("captcha") || + pageSource.contains("recaptcha") || + pageSource.contains("人机验证") || + pageSource.contains("异常流量") || + pageSource.contains("security check") || + pageSource.contains("sorry/index") || + pageSource.contains("before we can serve your request"); + + // 检查URL是否为验证码页面 + boolean isCaptchaUrl = currentUrl.contains("sorry/index") || + currentUrl.contains("security-check") || + currentUrl.contains("captcha"); + + return hasCaptchaKeywords || isCaptchaUrl; + } catch (Exception e) { + log.error("检查验证码页面时出错", e); + return false; + } + } + + // 测试方法 + public static void main(String[] args) { + // 测试用例:搜索关键词,查找特定网站的排名 + int rank1 = getGoogleRank("大卡车", "zh.wikipedia.org"); + System.out.println("维基百科排名:" + formatRank(rank1)); + } + + // 辅助方法:格式化排名输出 + private static String formatRank(int rank) { + if (rank > 0) return rank + "名"; + else if (rank == -1) return "未找到"; + else if (rank == -2) return "遇到验证码"; + else return "查询失败"; + } +}