富文本编辑器
This commit is contained in:
parent
a54657c88d
commit
09f1220ae3
@ -171,6 +171,42 @@
|
||||
<version>3.1.0</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<!-- Jsoup HTML解析 -->
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.17.2</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.github.rholder</groupId>
|
||||
<artifactId>guava-retrying</artifactId>
|
||||
<version>2.0.0</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Selenium WebDriver -->
|
||||
<dependency>
|
||||
<groupId>org.seleniumhq.selenium</groupId>
|
||||
<artifactId>selenium-java</artifactId>
|
||||
<version>3.141.59</version> <!-- 使用这个版本 -->
|
||||
</dependency>
|
||||
|
||||
<!-- WebDriverManager -->
|
||||
<dependency>
|
||||
<groupId>io.github.bonigarcia</groupId>
|
||||
<artifactId>webdrivermanager</artifactId>
|
||||
<version>5.6.3</version>
|
||||
</dependency>
|
||||
|
||||
<!-- HttpClient dependencies for WebDriverManager -->
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents.client5</groupId>
|
||||
<artifactId>httpclient5</artifactId>
|
||||
<version>5.2.1</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
@ -0,0 +1,360 @@
|
||||
package com.ruoyi.common.utils;
|
||||
|
||||
import com.github.rholder.retry.Retryer;
|
||||
import com.github.rholder.retry.RetryerBuilder;
|
||||
import com.github.rholder.retry.StopStrategies;
|
||||
import com.github.rholder.retry.WaitStrategies;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.openqa.selenium.WebDriver;
|
||||
import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class GoogleUtil {
|
||||
private static final Logger log = LoggerFactory.getLogger(GoogleUtil.class);
|
||||
// 可配置参数(Ruoyi项目建议用@Value从application.yml读取)
|
||||
private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q=";
|
||||
private static final String PROXY_HOST = "127.0.0.1";
|
||||
private static final int PROXY_PORT = 7897;
|
||||
private static final int CONNECT_TIMEOUT = 10000;
|
||||
private static final int READ_TIMEOUT = 15000;
|
||||
// 重试机制
|
||||
private static final Retryer<Boolean> RETRYER = RetryerBuilder.<Boolean>newBuilder()
|
||||
.retryIfExceptionOfType(Exception.class)
|
||||
.retryIfResult(result -> result != null && !result)
|
||||
.withStopStrategy(StopStrategies.stopAfterAttempt(3))
|
||||
.withWaitStrategy(WaitStrategies.fixedWait(1, TimeUnit.SECONDS))
|
||||
.build();
|
||||
|
||||
// 扩展User-Agent池(增加移动端标识,反爬更友好)
|
||||
private static final List<String> USER_AGENTS = Arrays.asList(
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
|
||||
"Mozilla/5.0 (Android 14; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0"
|
||||
);
|
||||
|
||||
/**
|
||||
* 核心方法:获取Google搜索排名(使用Selenium实现)
|
||||
*/
|
||||
public static int getGoogleRank(String searchText, String webSite) {
|
||||
// 入参校验
|
||||
if (searchText == null || searchText.trim().isEmpty() || webSite == null || webSite.trim().isEmpty()) {
|
||||
log.error("入参非法:searchText={}, webSite={}", searchText, webSite);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 最多重试3次
|
||||
int maxRetries = 3;
|
||||
for (int attempt = 1; attempt <= maxRetries; attempt++) {
|
||||
int result = getGoogleRankInternal(searchText, webSite);
|
||||
if (result != -2) { // 不是验证码错误
|
||||
return result;
|
||||
}
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
log.info("遇到验证码,{}秒后进行第{}次重试", 5 * attempt, attempt + 1);
|
||||
try {
|
||||
Thread.sleep(5000 * attempt); // 递增延迟重试
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.error("经过{}次尝试后仍然遇到验证码,搜索失败", maxRetries);
|
||||
return -2;
|
||||
}
|
||||
|
||||
/**
|
||||
* 内部方法:执行Google搜索排名获取
|
||||
*/
|
||||
private static int getGoogleRankInternal(String searchText, String webSite) {
|
||||
webSite = webSite.trim();
|
||||
String encodedSearchText = encodeSearchText(searchText);
|
||||
if (encodedSearchText == null) {
|
||||
return 0;
|
||||
}
|
||||
String finalUrl = GOOGLE_SEARCH_URL + encodedSearchText;
|
||||
log.info("发起Google搜索请求:url={}, 目标网站={}", finalUrl, webSite);
|
||||
|
||||
WebDriver driver = null;
|
||||
try {
|
||||
// 初始化WebDriver
|
||||
driver = createWebDriver();
|
||||
|
||||
int rank = -1;
|
||||
// 搜索前三页
|
||||
for (int page = 1; page <= 3; page++) {
|
||||
String pageUrl = page == 1 ? finalUrl : finalUrl + "&start=" + ((page - 1) * 10);
|
||||
log.info("搜索第{}页: {}", page, pageUrl);
|
||||
|
||||
// 访问Google搜索页面
|
||||
driver.get(pageUrl);
|
||||
|
||||
// 随机延时,模拟人类行为(增加延时以减少被识别为机器人的可能性)
|
||||
Thread.sleep((long) (3000 + Math.random() * 5000));
|
||||
|
||||
// 检查是否出现了验证码页面
|
||||
if (isCaptchaPage(driver)) {
|
||||
log.warn("检测到验证码页面,本次请求失败");
|
||||
return -2; // 特殊返回值表示遇到验证码
|
||||
}
|
||||
|
||||
// 获取页面源码
|
||||
String html = driver.getPageSource();
|
||||
log.debug("第{}页请求成功:响应长度={}字节", page, html.length());
|
||||
|
||||
// 解析当前页的排名
|
||||
int pageRank = parseRankFromHtml(html, webSite, (page - 1) * 10);
|
||||
if (pageRank > 0) {
|
||||
rank = pageRank;
|
||||
break;
|
||||
}
|
||||
|
||||
// 页面间增加随机延时
|
||||
if (page < 3) {
|
||||
Thread.sleep((long) (2000 + Math.random() * 3000));
|
||||
}
|
||||
}
|
||||
|
||||
return rank;
|
||||
} catch (Exception e) {
|
||||
log.error("获取排名异常:url={}, 原因={}", finalUrl, e.getMessage(), e);
|
||||
return 0;
|
||||
} finally {
|
||||
// 关闭浏览器
|
||||
if (driver != null) {
|
||||
try {
|
||||
driver.quit();
|
||||
} catch (Exception e) {
|
||||
log.error("关闭浏览器失败", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建WebDriver实例
|
||||
* @return WebDriver
|
||||
*/
|
||||
private static WebDriver createWebDriver() {
|
||||
// 设置ChromeDriver路径(根据实际路径调整)
|
||||
System.setProperty("webdriver.chrome.driver", "D:/chromedriver-win64/chromedriver.exe");
|
||||
|
||||
ChromeOptions options = new ChromeOptions();
|
||||
// 设置无头模式 (Selenium 3.x语法)
|
||||
options.addArguments("--headless");
|
||||
// 设置User-Agent
|
||||
String randomUserAgent = USER_AGENTS.get((int) (Math.random() * USER_AGENTS.size()));
|
||||
options.addArguments("--user-agent=" + randomUserAgent);
|
||||
// 禁用图片加载提高速度
|
||||
options.addArguments("--blink-settings=imagesEnabled=false");
|
||||
// 设置窗口大小
|
||||
options.addArguments("--window-size=1920,1080");
|
||||
// 禁用自动化控制特征
|
||||
options.addArguments("--disable-blink-features=AutomationControlled");
|
||||
options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
|
||||
// 禁用自动化标志
|
||||
options.addArguments("--disable-extensions");
|
||||
options.addArguments("--no-sandbox");
|
||||
options.addArguments("--disable-dev-shm-usage");
|
||||
// 禁用SSL错误
|
||||
options.addArguments("--ignore-ssl-errors");
|
||||
options.addArguments("--ignore-certificate-errors");
|
||||
// 禁用日志
|
||||
options.addArguments("--log-level=3");
|
||||
options.addArguments("--silent");
|
||||
|
||||
// 如果需要使用代理(轮换代理IP可以有效避免验证码)
|
||||
/*
|
||||
if (PROXY_HOST != null && !PROXY_HOST.isEmpty()) {
|
||||
// 可以在这里集成代理IP服务,每次使用不同的IP
|
||||
options.addArguments("--proxy-server=http://" + PROXY_HOST + ":" + PROXY_PORT);
|
||||
}
|
||||
*/
|
||||
|
||||
WebDriver driver = new ChromeDriver(options);
|
||||
// 执行JavaScript隐藏webdriver属性
|
||||
((org.openqa.selenium.JavascriptExecutor) driver).executeScript(
|
||||
"Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
|
||||
|
||||
return driver;
|
||||
}
|
||||
|
||||
/**
|
||||
* 编码搜索关键词
|
||||
*/
|
||||
private static String encodeSearchText(String searchText) {
|
||||
try {
|
||||
return URLEncoder.encode(searchText.trim(), StandardCharsets.UTF_8.name());
|
||||
} catch (Exception e) {
|
||||
log.error("关键词编码失败:searchText={}, 编码格式={}", searchText, StandardCharsets.UTF_8.name(), e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析排名
|
||||
*/
|
||||
private static int parseRankFromHtml(String html, String targetWebSite, int baseRank) {
|
||||
Document doc = Jsoup.parse(html);
|
||||
|
||||
// 获取所有可能的搜索结果容器
|
||||
Elements resultContainers = doc.select("div.g");
|
||||
|
||||
if (resultContainers.isEmpty()) {
|
||||
log.warn("未找到任何搜索结果容器");
|
||||
return -1;
|
||||
}
|
||||
|
||||
log.debug("找到 {} 个搜索结果容器", resultContainers.size());
|
||||
|
||||
String cleanTarget = targetWebSite
|
||||
.replace("https://", "")
|
||||
.replace("http://", "")
|
||||
.replace("www.", "");
|
||||
|
||||
// 如果目标网站包含路径,则只取域名部分进行比较
|
||||
if (cleanTarget.contains("/")) {
|
||||
cleanTarget = cleanTarget.substring(0, cleanTarget.indexOf("/"));
|
||||
}
|
||||
|
||||
int validResultCount = 0;
|
||||
|
||||
for (Element container : resultContainers) {
|
||||
// 跳过图片和视频区块
|
||||
if (container.select("div.XaIwc, div.islrc, div.JTuIPc, div.PiKbc, div[jscontroller]").size() > 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 查找容器中的链接
|
||||
Elements links = container.select("a[href]");
|
||||
if (links.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
boolean isSearchResult = false;
|
||||
String resultUrl = "";
|
||||
|
||||
// 查找有效的搜索结果链接
|
||||
for (Element link : links) {
|
||||
String href = link.attr("href");
|
||||
|
||||
// 跳过Google内部链接
|
||||
if (href.contains("google.com") || href.startsWith("/url?") || href.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 检查是否是标题链接(通常在h3标签内)
|
||||
if (link.parent() != null && link.parent().tagName().equals("h3")) {
|
||||
isSearchResult = true;
|
||||
resultUrl = href;
|
||||
break;
|
||||
}
|
||||
|
||||
// 或者检查是否在常见的搜索结果区块中
|
||||
Element parent = link.parent();
|
||||
while (parent != null) {
|
||||
if (parent.hasClass("yuRUbf") || parent.hasClass("tF2Cxc")) {
|
||||
isSearchResult = true;
|
||||
resultUrl = href;
|
||||
break;
|
||||
}
|
||||
parent = parent.parent();
|
||||
}
|
||||
|
||||
if (isSearchResult) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!isSearchResult) {
|
||||
continue;
|
||||
}
|
||||
|
||||
validResultCount++;
|
||||
|
||||
String cleanResult = resultUrl
|
||||
.replace("https://", "")
|
||||
.replace("http://", "")
|
||||
.replace("www.", "");
|
||||
|
||||
// 如果结果URL包含路径,则只取域名部分进行比较
|
||||
if (cleanResult.contains("/")) {
|
||||
int firstSlash = cleanResult.indexOf("/");
|
||||
if (firstSlash > 0) {
|
||||
cleanResult = cleanResult.substring(0, firstSlash);
|
||||
}
|
||||
}
|
||||
|
||||
// 匹配目标网站
|
||||
if (cleanResult.contains(cleanTarget) || cleanTarget.contains(cleanResult)) {
|
||||
int actualRank = baseRank + validResultCount;
|
||||
log.info("找到目标网站:排名={}, 结果URL={}, 目标网站={}", actualRank, resultUrl, targetWebSite);
|
||||
return actualRank;
|
||||
}
|
||||
}
|
||||
|
||||
log.info("在当前页面未找到目标网站:targetWebSite={}", targetWebSite);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查是否为验证码页面
|
||||
* @param driver WebDriver实例
|
||||
* @return 是否为验证码页面
|
||||
*/
|
||||
private static boolean isCaptchaPage(WebDriver driver) {
|
||||
try {
|
||||
String pageSource = driver.getPageSource().toLowerCase();
|
||||
String currentUrl = driver.getCurrentUrl().toLowerCase();
|
||||
|
||||
// 检查页面是否包含验证码相关关键词
|
||||
boolean hasCaptchaKeywords = pageSource.contains("captcha") ||
|
||||
pageSource.contains("recaptcha") ||
|
||||
pageSource.contains("人机验证") ||
|
||||
pageSource.contains("异常流量") ||
|
||||
pageSource.contains("security check") ||
|
||||
pageSource.contains("sorry/index") ||
|
||||
pageSource.contains("before we can serve your request");
|
||||
|
||||
// 检查URL是否为验证码页面
|
||||
boolean isCaptchaUrl = currentUrl.contains("sorry/index") ||
|
||||
currentUrl.contains("security-check") ||
|
||||
currentUrl.contains("captcha");
|
||||
|
||||
return hasCaptchaKeywords || isCaptchaUrl;
|
||||
} catch (Exception e) {
|
||||
log.error("检查验证码页面时出错", e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// 测试方法
|
||||
public static void main(String[] args) {
|
||||
// 测试用例:搜索关键词,查找特定网站的排名
|
||||
int rank1 = getGoogleRank("大卡车", "zh.wikipedia.org");
|
||||
System.out.println("维基百科排名:" + formatRank(rank1));
|
||||
}
|
||||
|
||||
// 辅助方法:格式化排名输出
|
||||
private static String formatRank(int rank) {
|
||||
if (rank > 0) return rank + "名";
|
||||
else if (rank == -1) return "未找到";
|
||||
else if (rank == -2) return "遇到验证码";
|
||||
else return "查询失败";
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user