Compare commits

..

2 Commits

Author SHA1 Message Date
471e0a96fa Merge remote-tracking branch 'origin/master' 2025-09-05 14:24:58 +08:00
09f1220ae3 富文本编辑器 2025-09-03 22:07:26 +08:00
2 changed files with 396 additions and 0 deletions

View File

@ -171,6 +171,42 @@
<version>3.1.0</version>
</dependency>
<!-- Jsoup HTML parser -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>com.github.rholder</groupId>
<artifactId>guava-retrying</artifactId>
<version>2.0.0</version>
</dependency>
<!-- Selenium WebDriver -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version> <!-- use this version (Selenium 3.x) -->
</dependency>
<!-- WebDriverManager -->
<dependency>
<groupId>io.github.bonigarcia</groupId>
<artifactId>webdrivermanager</artifactId>
<version>5.6.3</version>
</dependency>
<!-- HttpClient dependencies for WebDriverManager -->
<dependency>
<groupId>org.apache.httpcomponents.client5</groupId>
<artifactId>httpclient5</artifactId>
<version>5.2.1</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,360 @@
package com.ruoyi.common.utils;
import com.github.rholder.retry.Retryer;
import com.github.rholder.retry.RetryerBuilder;
import com.github.rholder.retry.StopStrategies;
import com.github.rholder.retry.WaitStrategies;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
/**
 * Looks up the position of a website in Google search results.
 *
 * <p>The lookup drives a headless Chrome instance through Selenium, loads up to
 * {@link #MAX_PAGES} result pages and parses the rendered HTML with Jsoup.
 *
 * <p>Result codes returned by {@link #getGoogleRank(String, String)}:
 * <ul>
 *   <li>a positive number: the 1-based rank of the first matching result;</li>
 *   <li>{@link #RANK_NOT_FOUND}: the site was not found on the inspected pages;</li>
 *   <li>{@link #RANK_CAPTCHA}: Google answered with a captcha page on every attempt;</li>
 *   <li>{@link #RANK_ERROR}: invalid arguments or an unexpected failure.</li>
 * </ul>
 */
public class GoogleUtil {
    private static final Logger log = LoggerFactory.getLogger(GoogleUtil.class);

    /** Result code: invalid input or unexpected failure. */
    public static final int RANK_ERROR = 0;
    /** Result code: the target site is not among the inspected results. */
    public static final int RANK_NOT_FOUND = -1;
    /** Result code: Google served a captcha page instead of results. */
    public static final int RANK_CAPTCHA = -2;

    // Configurable parameters; in a RuoYi project these would ideally be read from
    // application.yml via @Value.
    private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q=";
    private static final String PROXY_HOST = "127.0.0.1";
    private static final int PROXY_PORT = 7897;
    /** Proxy routing is off by default; flip to route Chrome through PROXY_HOST:PROXY_PORT. */
    private static final boolean USE_PROXY = false;
    // Not referenced by the Selenium-based implementation below.
    private static final int CONNECT_TIMEOUT = 10000;
    private static final int READ_TIMEOUT = 15000;

    /** System property Selenium reads to locate the chromedriver binary. */
    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";
    /** Fallback chromedriver location, used only when the system property is not set. */
    private static final String DEFAULT_CHROME_DRIVER_PATH = "D:/chromedriver-win64/chromedriver.exe";

    /** How many times a lookup is attempted when Google keeps answering with a captcha. */
    private static final int MAX_CAPTCHA_ATTEMPTS = 3;
    /** How many result pages are inspected per lookup. */
    private static final int MAX_PAGES = 3;
    /** Offset step used for the {@code start} query parameter between pages. */
    private static final int RESULTS_PER_PAGE = 10;

    // Retry policy. Not referenced at the moment: getGoogleRank runs its own retry loop.
    private static final Retryer<Boolean> RETRYER = RetryerBuilder.<Boolean>newBuilder()
            .retryIfExceptionOfType(Exception.class)
            .retryIfResult(result -> result != null && !result)
            .withStopStrategy(StopStrategies.stopAfterAttempt(3))
            .withWaitStrategy(WaitStrategies.fixedWait(1, TimeUnit.SECONDS))
            .build();

    // User-Agent pool, including mobile identifiers, one of which is picked per browser session.
    private static final List<String> USER_AGENTS = Arrays.asList(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Android 14; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0"
    );

    /** Lower-cased substrings whose presence in the page source marks a captcha page. */
    private static final List<String> CAPTCHA_PAGE_MARKERS = Arrays.asList(
            "captcha",
            "recaptcha",
            "人机验证",
            "异常流量",
            "security check",
            "sorry/index",
            "before we can serve your request"
    );

    /** Lower-cased substrings whose presence in the current URL marks a captcha page. */
    private static final List<String> CAPTCHA_URL_MARKERS = Arrays.asList(
            "sorry/index",
            "security-check",
            "captcha"
    );

    /**
     * Returns the Google rank of {@code webSite} for the query {@code searchText}.
     *
     * <p>When a lookup ends with a captcha page, the whole lookup is repeated up to
     * {@link #MAX_CAPTCHA_ATTEMPTS} times with an increasing delay (5s, 10s, ...).
     * If the waiting thread is interrupted, its interrupt flag is restored and
     * {@link #RANK_CAPTCHA} is returned without further attempts.
     *
     * @param searchText the search keywords; must not be null or blank
     * @param webSite    the site to look for (domain or URL); must not be null or blank
     * @return a positive rank, or one of {@link #RANK_NOT_FOUND}, {@link #RANK_CAPTCHA},
     *         {@link #RANK_ERROR}
     */
    public static int getGoogleRank(String searchText, String webSite) {
        if (searchText == null || searchText.trim().isEmpty() || webSite == null || webSite.trim().isEmpty()) {
            log.error("入参非法searchText={}, webSite={}", searchText, webSite);
            return RANK_ERROR;
        }
        for (int attempt = 1; attempt <= MAX_CAPTCHA_ATTEMPTS; attempt++) {
            int result = getGoogleRankInternal(searchText, webSite);
            if (result != RANK_CAPTCHA) {
                return result;
            }
            if (attempt < MAX_CAPTCHA_ATTEMPTS) {
                log.info("遇到验证码,{}秒后进行第{}次重试", 5 * attempt, attempt + 1);
                try {
                    // Back off longer on every retry.
                    Thread.sleep(5000L * attempt);
                } catch (InterruptedException e) {
                    // Preserve the interrupt for the caller and report what actually happened
                    // instead of claiming that every attempt was used up.
                    Thread.currentThread().interrupt();
                    log.warn("等待重试时线程被中断,放弃后续重试");
                    return RANK_CAPTCHA;
                }
            }
        }
        log.error("经过{}次尝试后仍然遇到验证码,搜索失败", MAX_CAPTCHA_ATTEMPTS);
        return RANK_CAPTCHA;
    }

    /**
     * Runs a single lookup: opens a browser, walks the result pages and parses each one.
     * The browser is always closed before returning.
     *
     * @return a positive rank, {@link #RANK_NOT_FOUND}, {@link #RANK_CAPTCHA} when a captcha
     *         page is detected, or {@link #RANK_ERROR} on encoding failure, interruption or
     *         any other exception
     */
    private static int getGoogleRankInternal(String searchText, String webSite) {
        String targetSite = webSite.trim();
        String encodedSearchText = encodeSearchText(searchText);
        if (encodedSearchText == null) {
            return RANK_ERROR;
        }
        String finalUrl = GOOGLE_SEARCH_URL + encodedSearchText;
        log.info("发起Google搜索请求url={}, 目标网站={}", finalUrl, targetSite);
        WebDriver driver = null;
        try {
            driver = createWebDriver();
            for (int page = 1; page <= MAX_PAGES; page++) {
                int offset = (page - 1) * RESULTS_PER_PAGE;
                String pageUrl = page == 1 ? finalUrl : finalUrl + "&start=" + offset;
                log.info("搜索第{}页: {}", page, pageUrl);
                driver.get(pageUrl);
                // Random 3-8s pause after loading, to look less like an automated client.
                pauseRandomly(3000, 5000);
                if (isCaptchaPage(driver)) {
                    log.warn("检测到验证码页面,本次请求失败");
                    return RANK_CAPTCHA;
                }
                String html = driver.getPageSource();
                log.debug("第{}页请求成功:响应长度={}字节", page, html.length());
                int pageRank = parseRankFromHtml(html, targetSite, offset);
                if (pageRank > 0) {
                    return pageRank;
                }
                // Random 2-5s pause between pages.
                if (page < MAX_PAGES) {
                    pauseRandomly(2000, 3000);
                }
            }
            return RANK_NOT_FOUND;
        } catch (InterruptedException e) {
            // Restore the flag so callers (and the retry loop) can observe the interruption.
            Thread.currentThread().interrupt();
            log.warn("获取排名被中断url={}", finalUrl);
            return RANK_ERROR;
        } catch (Exception e) {
            log.error("获取排名异常url={}, 原因={}", finalUrl, e.getMessage(), e);
            return RANK_ERROR;
        } finally {
            if (driver != null) {
                try {
                    driver.quit();
                } catch (Exception e) {
                    log.error("关闭浏览器失败", e);
                }
            }
        }
    }

    /**
     * Sleeps for {@code baseMillis} plus a random amount in {@code [0, jitterMillis)}.
     *
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private static void pauseRandomly(int baseMillis, int jitterMillis) throws InterruptedException {
        Thread.sleep((long) baseMillis + ThreadLocalRandom.current().nextInt(jitterMillis));
    }

    /**
     * Creates a headless Chrome WebDriver with a random User-Agent.
     *
     * <p>The chromedriver location is taken from the {@code webdriver.chrome.driver} system
     * property; {@link #DEFAULT_CHROME_DRIVER_PATH} is applied only when that property is
     * missing or blank, so an externally configured path is never overwritten.
     *
     * @return a ready-to-use WebDriver; the caller is responsible for calling {@code quit()}
     */
    private static WebDriver createWebDriver() {
        String configuredDriverPath = System.getProperty(CHROME_DRIVER_PROPERTY);
        if (configuredDriverPath == null || configuredDriverPath.trim().isEmpty()) {
            System.setProperty(CHROME_DRIVER_PROPERTY, DEFAULT_CHROME_DRIVER_PATH);
        }
        ChromeOptions options = new ChromeOptions();
        // Headless mode (Selenium 3.x syntax).
        options.addArguments("--headless");
        String randomUserAgent = USER_AGENTS.get(ThreadLocalRandom.current().nextInt(USER_AGENTS.size()));
        options.addArguments("--user-agent=" + randomUserAgent);
        // Skip image loading to speed pages up.
        options.addArguments("--blink-settings=imagesEnabled=false");
        options.addArguments("--window-size=1920,1080");
        // Reduce the automation fingerprint exposed to the page.
        options.addArguments("--disable-blink-features=AutomationControlled");
        options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
        options.addArguments("--disable-extensions");
        options.addArguments("--no-sandbox");
        options.addArguments("--disable-dev-shm-usage");
        // Ignore SSL / certificate errors.
        options.addArguments("--ignore-ssl-errors");
        options.addArguments("--ignore-certificate-errors");
        // Keep browser logging quiet.
        options.addArguments("--log-level=3");
        options.addArguments("--silent");
        // Optional proxy; rotating proxy IPs is one way to see fewer captchas.
        if (USE_PROXY && !PROXY_HOST.isEmpty()) {
            options.addArguments("--proxy-server=http://" + PROXY_HOST + ":" + PROXY_PORT);
        }
        WebDriver driver = new ChromeDriver(options);
        try {
            // Hide navigator.webdriver from page scripts.
            // NOTE(review): this runs against the document open right after start-up; confirm
            // whether the override still applies to pages loaded later through driver.get().
            ((org.openqa.selenium.JavascriptExecutor) driver).executeScript(
                    "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
            return driver;
        } catch (RuntimeException e) {
            // The caller never receives the driver on this path, so it must be closed here
            // or the browser process would be leaked.
            try {
                driver.quit();
            } catch (RuntimeException quitFailure) {
                e.addSuppressed(quitFailure);
            }
            throw e;
        }
    }

    /**
     * URL-encodes the trimmed search keywords as UTF-8.
     *
     * @return the encoded text, or {@code null} if encoding fails
     */
    private static String encodeSearchText(String searchText) {
        try {
            return URLEncoder.encode(searchText.trim(), StandardCharsets.UTF_8.name());
        } catch (UnsupportedEncodingException e) {
            log.error("关键词编码失败searchText={}, 编码格式={}", searchText, StandardCharsets.UTF_8.name(), e);
            return null;
        }
    }

    /**
     * Finds the rank of the target site within one result page.
     *
     * @param html          rendered HTML of the result page
     * @param targetWebSite the site to look for (domain or URL)
     * @param baseRank      number of results on earlier pages, added to the in-page position
     * @return {@code baseRank} plus the 1-based position of the first matching result,
     *         or {@code -1} when the page has no match
     */
    private static int parseRankFromHtml(String html, String targetWebSite, int baseRank) {
        Document doc = Jsoup.parse(html);
        Elements resultContainers = doc.select("div.g");
        if (resultContainers.isEmpty()) {
            log.warn("未找到任何搜索结果容器");
            return RANK_NOT_FOUND;
        }
        log.debug("找到 {} 个搜索结果容器", resultContainers.size());
        String targetHost = extractHost(targetWebSite);
        if (targetHost.isEmpty()) {
            // An empty host would otherwise "match" anything; treat it as not found.
            log.warn("目标网站无法解析出域名targetWebSite={}", targetWebSite);
            return RANK_NOT_FOUND;
        }
        int validResultCount = 0;
        for (Element container : resultContainers) {
            // Skip image / video blocks.
            if (!container.select("div.XaIwc, div.islrc, div.JTuIPc, div.PiKbc, div[jscontroller]").isEmpty()) {
                continue;
            }
            String resultUrl = findResultUrl(container);
            if (resultUrl == null) {
                continue;
            }
            validResultCount++;
            if (hostsMatch(extractHost(resultUrl), targetHost)) {
                int actualRank = baseRank + validResultCount;
                log.info("找到目标网站:排名={}, 结果URL={}, 目标网站={}", actualRank, resultUrl, targetWebSite);
                return actualRank;
            }
        }
        log.info("在当前页面未找到目标网站targetWebSite={}", targetWebSite);
        return RANK_NOT_FOUND;
    }

    /**
     * Returns the href of the first link in {@code container} that looks like an organic
     * result: a link whose parent is an {@code h3}, or that sits inside an element with
     * class {@code yuRUbf} or {@code tF2Cxc}. Google-internal, fragment and empty links
     * are ignored.
     *
     * @return the href, or {@code null} when the container holds no such link
     */
    private static String findResultUrl(Element container) {
        for (Element link : container.select("a[href]")) {
            String href = link.attr("href");
            if (href.isEmpty() || href.contains("google.com") || href.startsWith("/url?") || href.startsWith("#")) {
                continue;
            }
            Element parent = link.parent();
            // Title links are usually wrapped directly in an h3.
            if (parent != null && parent.tagName().equals("h3")) {
                return href;
            }
            // Otherwise accept links nested in the usual result wrappers.
            for (Element ancestor = parent; ancestor != null; ancestor = ancestor.parent()) {
                if (ancestor.hasClass("yuRUbf") || ancestor.hasClass("tF2Cxc")) {
                    return href;
                }
            }
        }
        return null;
    }

    /**
     * Reduces a URL or bare domain to a comparable host: lower-cased, without an
     * {@code http(s)://} or {@code //} prefix, without path, query, fragment or port, and
     * without a leading {@code www.}. Returns an empty string when nothing host-like
     * remains (for example for a relative URL such as {@code /search}).
     */
    private static String extractHost(String url) {
        String host = url.trim().toLowerCase(Locale.ROOT);
        if (host.startsWith("https://")) {
            host = host.substring("https://".length());
        } else if (host.startsWith("http://")) {
            host = host.substring("http://".length());
        } else if (host.startsWith("//")) {
            host = host.substring(2);
        }
        int end = host.length();
        for (int i = 0; i < host.length(); i++) {
            char c = host.charAt(i);
            if (c == '/' || c == '?' || c == '#') {
                end = i;
                break;
            }
        }
        host = host.substring(0, end);
        int portSeparator = host.lastIndexOf(':');
        if (portSeparator >= 0) {
            host = host.substring(0, portSeparator);
        }
        // Only a leading "www." is dropped; "www." elsewhere in the name is part of the host.
        if (host.startsWith("www.")) {
            host = host.substring("www.".length());
        }
        return host;
    }

    /**
     * Decides whether a result host belongs to the target.
     *
     * <p>Hosts match when they are equal or when one is a sub-domain of the other, compared
     * on label boundaries (so {@code notexample.com} does not match {@code example.com}).
     * A target without any dot is treated as a single label and matches when the result
     * host contains exactly that label.
     */
    private static boolean hostsMatch(String resultHost, String targetHost) {
        if (resultHost.isEmpty() || targetHost.isEmpty()) {
            return false;
        }
        if (!targetHost.contains(".")) {
            return Arrays.asList(resultHost.split("\\.")).contains(targetHost);
        }
        return isSameOrSubdomain(resultHost, targetHost) || isSameOrSubdomain(targetHost, resultHost);
    }

    /** Returns true when {@code host} equals {@code domain} or ends with {@code "." + domain}. */
    private static boolean isSameOrSubdomain(String host, String domain) {
        return host.equals(domain) || host.endsWith("." + domain);
    }

    /**
     * Checks whether the browser is currently showing a captcha / unusual-traffic page,
     * based on marker substrings in the page source and in the current URL.
     *
     * @param driver the WebDriver instance to inspect
     * @return {@code true} if a captcha marker is found; {@code false} otherwise, including
     *         when the page cannot be inspected
     */
    private static boolean isCaptchaPage(WebDriver driver) {
        try {
            // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
            String pageSource = driver.getPageSource().toLowerCase(Locale.ROOT);
            String currentUrl = driver.getCurrentUrl().toLowerCase(Locale.ROOT);
            boolean hasCaptchaKeywords = CAPTCHA_PAGE_MARKERS.stream().anyMatch(pageSource::contains);
            boolean isCaptchaUrl = CAPTCHA_URL_MARKERS.stream().anyMatch(currentUrl::contains);
            return hasCaptchaKeywords || isCaptchaUrl;
        } catch (Exception e) {
            // Best effort: a failed check is treated as "no captcha" and the lookup goes on.
            log.error("检查验证码页面时出错", e);
            return false;
        }
    }

    /** Manual smoke test: looks up the rank of a specific site for a sample query. */
    public static void main(String[] args) {
        int rank1 = getGoogleRank("大卡车", "zh.wikipedia.org");
        System.out.println("维基百科排名:" + formatRank(rank1));
    }

    /** Turns a rank or result code into the text printed by {@link #main(String[])}. */
    private static String formatRank(int rank) {
        if (rank > 0) {
            return String.valueOf(rank);
        }
        if (rank == RANK_NOT_FOUND) {
            return "未找到";
        }
        if (rank == RANK_CAPTCHA) {
            return "遇到验证码";
        }
        return "查询失败";
    }
}