This commit is contained in:
Vinjor 2025-09-11 10:34:45 +08:00
parent fe1f3264fa
commit 20aabdf5e8
5 changed files with 410 additions and 157 deletions

View File

@ -7,10 +7,14 @@ import cn.hutool.core.date.DateUtil;
import com.alibaba.fastjson2.JSONObject; import com.alibaba.fastjson2.JSONObject;
import com.baomidou.mybatisplus.core.metadata.IPage; import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page; import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.ruoyi.base.domain.BaseSite;
import com.ruoyi.base.service.IBaseSiteInfoService;
import com.ruoyi.base.service.IBaseSiteService;
import com.ruoyi.busi.domain.BusiKeyword; import com.ruoyi.busi.domain.BusiKeyword;
import com.ruoyi.busi.service.IBusiKeywordService; import com.ruoyi.busi.service.IBusiKeywordService;
import com.ruoyi.busi.vo.BusiKeywordItemQueryVO; import com.ruoyi.busi.vo.BusiKeywordItemQueryVO;
import com.ruoyi.busi.vo.BusiKeywordRankStatVO; import com.ruoyi.busi.vo.BusiKeywordRankStatVO;
import com.ruoyi.common.utils.GoogleUtil;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
@ -31,6 +35,8 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl<BusiKeywordItemMappe
private BusiKeywordItemMapper busiKeywordItemMapper; private BusiKeywordItemMapper busiKeywordItemMapper;
@Autowired @Autowired
private IBusiKeywordService busiKeywordService; private IBusiKeywordService busiKeywordService;
@Autowired
private IBaseSiteService baseSiteService;
@Override @Override
public IPage<BusiKeywordItem> queryListPage(BusiKeywordItem pageReqVO, Page<BusiKeywordItem> page) { public IPage<BusiKeywordItem> queryListPage(BusiKeywordItem pageReqVO, Page<BusiKeywordItem> page) {
@ -46,6 +52,9 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl<BusiKeywordItemMappe
@Override @Override
public void getKeywordRanking() { public void getKeywordRanking() {
Date nowDate = new Date(); Date nowDate = new Date();
//所有网站
List<BaseSite> siteList = baseSiteService.list();
Map<String,String> siteMap = siteList.stream().collect(Collectors.toMap(BaseSite::getId, BaseSite::getSiteUrl));
List<BusiKeyword> keywordList = busiKeywordService.list(); List<BusiKeyword> keywordList = busiKeywordService.list();
//删除今日所有的排名数据 //删除今日所有的排名数据
busiKeywordItemMapper.deleteBySelectDateInt(DateUtil.formatDate(nowDate)); busiKeywordItemMapper.deleteBySelectDateInt(DateUtil.formatDate(nowDate));
@ -54,8 +63,12 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl<BusiKeywordItemMappe
BusiKeywordItem busiKeywordItem = new BusiKeywordItem(); BusiKeywordItem busiKeywordItem = new BusiKeywordItem();
busiKeywordItem.setTitle(keyword.getId()); busiKeywordItem.setTitle(keyword.getId());
busiKeywordItem.setSelectDate(nowDate); busiKeywordItem.setSelectDate(nowDate);
//TODO 调朱哥的接口获取排名 int ranking = GoogleUtil.getGoogleRank(keyword.getId(), siteMap.get(keyword.getTenantId()));
busiKeywordItem.setRanking(1); if(ranking>0 && ranking<=20){
busiKeywordItem.setRanking(ranking);
}else{
busiKeywordItem.setRanking(0);
}
busiKeywordItem.setTenantId(keyword.getTenantId()); busiKeywordItem.setTenantId(keyword.getTenantId());
insertList.add(busiKeywordItem); insertList.add(busiKeywordItem);
} }

View File

@ -4,16 +4,16 @@ spring:
type: com.alibaba.druid.pool.DruidDataSource type: com.alibaba.druid.pool.DruidDataSource
driverClassName: com.mysql.cj.jdbc.Driver driverClassName: com.mysql.cj.jdbc.Driver
druid: druid:
# 主库数据源-点亮开发库 # # 主库数据源-点亮开发库
master:
url: jdbc:mysql://82.156.161.160:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
username: site
password: 123456
#主库数据源-客户测试服务器
# master: # master:
# url: jdbc:mysql://127.0.0.1:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8 # url: jdbc:mysql://82.156.161.160:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
# username: site # username: site
# password: 123456 # password: 123456
# 主库数据源-客户测试服务器
master:
url: jdbc:mysql://1.92.99.15:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
username: site
password: 123456
# 从库数据源 # 从库数据源
slave: slave:
# 从数据源开关/默认关闭 # 从数据源开关/默认关闭

View File

@ -13,29 +13,33 @@ import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.springframework.stereotype.Component;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.net.URLDecoder;
import java.net.URL;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.HashMap;
import java.util.Map;
import java.nio.file.Files;
import java.nio.file.Paths;
public class GoogleUtil { public class GoogleUtil {
private static final Logger log = LoggerFactory.getLogger(GoogleUtil.class); private static final Logger log = LoggerFactory.getLogger(GoogleUtil.class);
// 可配置参数Ruoyi项目建议用@Value从application.yml读取 // 行为开关是否保留浏览器窗口默认为 false可在运行时通过 setter 修改
private static volatile boolean KEEP_BROWSER_OPEN = false;
// 可配置参数
private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q="; private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q=";
private static final String PROXY_HOST = "127.0.0.1";
private static final int PROXY_PORT = 7897;
private static final int CONNECT_TIMEOUT = 10000; private static final int CONNECT_TIMEOUT = 10000;
private static final int READ_TIMEOUT = 15000; private static final int READ_TIMEOUT = 15000;
// 重试机制
private static final Retryer<Boolean> RETRYER = RetryerBuilder.<Boolean>newBuilder()
.retryIfExceptionOfType(Exception.class)
.retryIfResult(result -> result != null && !result)
.withStopStrategy(StopStrategies.stopAfterAttempt(3))
.withWaitStrategy(WaitStrategies.fixedWait(1, TimeUnit.SECONDS))
.build();
// 扩展User-Agent池增加移动端标识反爬更友好 // 扩展User-Agent池增加移动端标识反爬更友好
private static final List<String> USER_AGENTS = Arrays.asList( private static final List<String> USER_AGENTS = Arrays.asList(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
@ -45,6 +49,21 @@ public class GoogleUtil {
"Mozilla/5.0 (Android 14; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0" "Mozilla/5.0 (Android 14; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0"
); );
/**
 * Sets the process-wide switch that decides whether the Chrome window is
 * left open after a ranking lookup.
 * <p>
 * The value is stored in the static volatile field KEEP_BROWSER_OPEN
 * (default false). When it is true, the lookup's cleanup skips
 * driver.quit() and createWebDriver() passes it to Chrome as the "detach"
 * experimental option.
 *
 * @param keep true to keep the browser window open, false to close it
 */
public static void setKeepBrowserOpen(boolean keep) {
KEEP_BROWSER_OPEN = keep;
}
/**
 * Returns the current value of the keep-browser-open switch.
 *
 * @return true when the browser window is kept open after a lookup
 */
public static boolean isKeepBrowserOpen() {
return KEEP_BROWSER_OPEN;
}
// 是否要求页面 UI 显示网页/全部筛选默认开启若未检测到则跳过本次采集
private static volatile boolean REQUIRE_WEB_TAB = true;
/**
 * Sets whether a result page must show the "Web"/"All" filter before it is
 * parsed.
 * <p>
 * The value is stored in the static volatile field REQUIRE_WEB_TAB
 * (default true). While it is true, a page on which neither the udm=14 URL
 * parameter nor one of the filter labels is detected makes the lookup
 * return 0 instead of parsing the page.
 *
 * @param require true to enforce the check, false to parse every page
 */
public static void setRequireWebTab(boolean require) {
REQUIRE_WEB_TAB = require;
}
/** /**
* 核心方法获取Google搜索排名使用Selenium实现 * 核心方法获取Google搜索排名使用Selenium实现
*/ */
@ -88,6 +107,11 @@ public class GoogleUtil {
return 0; return 0;
} }
String finalUrl = GOOGLE_SEARCH_URL + encodedSearchText; String finalUrl = GOOGLE_SEARCH_URL + encodedSearchText;
// 允许指定 gl地域
String gl = System.getenv("GOOGLE_GL");
if (gl != null && !gl.trim().isEmpty()) {
finalUrl = finalUrl + "&gl=" + gl.trim();
}
log.info("发起Google搜索请求url={}, 目标网站={}", finalUrl, webSite); log.info("发起Google搜索请求url={}, 目标网站={}", finalUrl, webSite);
WebDriver driver = null; WebDriver driver = null;
@ -96,16 +120,57 @@ public class GoogleUtil {
driver = createWebDriver(); driver = createWebDriver();
int rank = -1; int rank = -1;
// 搜索前三页 // 仅搜索前2页前20名
for (int page = 1; page <= 3; page++) { for (int page = 1; page <= 2; page++) {
String pageUrl = page == 1 ? finalUrl : finalUrl + "&start=" + ((page - 1) * 10); String pageUrl = page == 1 ? finalUrl : finalUrl + "&start=" + ((page - 1) * 10);
// 统一参数关闭个性化语言仅网页结果udm=14
pageUrl += "&hl=zh-CN&pws=0&num=10&udm=14";
log.info("搜索第{}页: {}", page, pageUrl); log.info("搜索第{}页: {}", page, pageUrl);
// 先访问 NCR 以避免地区重定向
try {
driver.get("https://www.google.com/ncr");
} catch (Exception ignore) {
}
// 访问后立刻检查是否跳转到人机验证/异常流量页面
try {
String cur = driver.getCurrentUrl().toLowerCase();
if (cur.contains("/sorry/") || cur.contains("sorry/index") || cur.contains("security-check") || cur.contains("captcha")) {
log.warn("发现跳转至验证页面: {}", cur);
return -2;
}
} catch (Exception e) {
log.warn("获取当前URL失败可能窗口被关闭或被拦截: {}", e.getMessage());
return -2;
}
// 访问Google搜索页面 // 访问Google搜索页面
driver.get(pageUrl); driver.get(pageUrl);
// 随机延时模拟人类行为增加延时以减少被识别为机器人的可能性 // 显式等待搜索结果区域标题出现避免未加载完全
Thread.sleep((long) (3000 + Math.random() * 5000)); try {
WebDriverWait wait = new WebDriverWait(driver, 10);
wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div#search")));
// 等待有机结果容器
wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div#search div.MjjYud div.tF2Cxc, div#search a h3")));
} catch (Exception e) {
log.warn("等待搜索结果元素出现超时: {}", e.getMessage());
}
// 人类化随机等待
Thread.sleep((long) (1500 + Math.random() * 3000));
// 轻微滚动模拟浏览
try {
org.openqa.selenium.JavascriptExecutor js = (org.openqa.selenium.JavascriptExecutor) driver;
long steps = 2 + (int) (Math.random() * 3);
for (int i = 0; i < steps; i++) {
js.executeScript("window.scrollBy(0, arguments[0]);", 200 + (int) (Math.random() * 400));
Thread.sleep((long) (400 + Math.random() * 600));
}
} catch (Exception ignore) {
}
// 检查是否出现了验证码页面 // 检查是否出现了验证码页面
if (isCaptchaPage(driver)) { if (isCaptchaPage(driver)) {
@ -117,19 +182,40 @@ public class GoogleUtil {
String html = driver.getPageSource(); String html = driver.getPageSource();
log.debug("第{}页请求成功:响应长度={}字节", page, html.length()); log.debug("第{}页请求成功:响应长度={}字节", page, html.length());
// 如需页面必须显示网页/全部筛选标签则检查 UI否则跳过采集
if (REQUIRE_WEB_TAB) {
try {
String curUrl = null;
try { curUrl = driver.getCurrentUrl(); } catch (Exception ignore) {}
boolean hasWebTab = (curUrl != null && curUrl.contains("udm=14"))
|| html.contains(">网页<") || html.contains(">全部<")
|| html.contains(">Web<") || html.contains(">All<");
if (!hasWebTab) {
log.warn("未检测到‘网页/全部筛选标签跳过本次采集REQUIRE_WEB_TAB=true");
return 0;
}
} catch (Exception ignore) {
}
}
// 解析当前页的排名 // 解析当前页的排名
int pageRank = parseRankFromHtml(html, webSite, (page - 1) * 10); int baseRankForPage = (page - 1) * 10;
int pageRank = parseRankFromHtml(html, webSite, baseRankForPage);
// 写入本页有机结果清单仅用于排查
try {
dumpOrganicList(Jsoup.parse(html), baseRankForPage);
} catch (Exception ignore) {
}
if (pageRank > 0) { if (pageRank > 0) {
rank = pageRank; rank = pageRank;
break; break;
} }
// 页面间增加随机延时 // 页面间增加随机延时
if (page < 3) { if (page < 2) {
Thread.sleep((long) (2000 + Math.random() * 3000)); Thread.sleep((long) (2000 + Math.random() * 3000));
} }
} }
return rank; return rank;
} catch (Exception e) { } catch (Exception e) {
log.error("获取排名异常url={}, 原因={}", finalUrl, e.getMessage(), e); log.error("获取排名异常url={}, 原因={}", finalUrl, e.getMessage(), e);
@ -137,59 +223,93 @@ public class GoogleUtil {
} finally { } finally {
// 关闭浏览器 // 关闭浏览器
if (driver != null) { if (driver != null) {
if (!KEEP_BROWSER_OPEN) {
try { try {
driver.quit(); driver.quit();
} catch (Exception e) { } catch (Exception e) {
log.error("关闭浏览器失败", e); log.error("关闭浏览器失败", e);
} }
} else {
log.info("根据配置保留浏览器窗口开启KEEP_BROWSER_OPEN=true");
}
} }
} }
} }
/** /**
* 创建WebDriver实例 * 创建WebDriver实例
*
* @return WebDriver * @return WebDriver
*/ */
private static WebDriver createWebDriver() { private static WebDriver createWebDriver() {
// 设置ChromeDriver路径根据实际路径调整 // 优先使用 WebDriverManager 自动管理驱动
System.setProperty("webdriver.chrome.driver", "D:/chromedriver-win64/chromedriver.exe"); try {
WebDriverManager.chromedriver().setup();
log.info("WebDriverManager 已自动配置 chromedriver");
} catch (Throwable t) {
// 允许失败后使用手动路径
log.warn("WebDriverManager 配置失败,尝试使用手动路径: {}", t.getMessage());
String sysProp = System.getProperty("webdriver.chrome.driver");
String envProp = System.getenv("CHROMEDRIVER_PATH");
if (sysProp == null || sysProp.isEmpty()) {
String path = (envProp != null && !envProp.isEmpty()) ? envProp : "D:/chromedriver.exe";
System.setProperty("webdriver.chrome.driver", path);
log.info("使用ChromeDriver路径: {}", path);
} else {
log.info("检测到系统属性webdriver.chrome.driver: {}", sysProp);
}
}
ChromeOptions options = new ChromeOptions(); ChromeOptions options = new ChromeOptions();
// 设置无头模式 (Selenium 3.x语法) // 按环境变量决定是否 headless默认有头更像真实用户
options.addArguments("--headless"); boolean headless = Boolean.parseBoolean(System.getenv().getOrDefault("GOOGLE_HEADLESS", "false"));
// 设置User-Agent if (headless) options.addArguments("--headless=new");
// 语言与用户体验设置
options.addArguments("--lang=zh-CN");
Map<String, Object> prefs = new HashMap<>();
prefs.put("intl.accept_languages", "zh-CN,zh;q=0.9");
options.setExperimentalOption("prefs", prefs);
// User-Agent 随机化
String randomUserAgent = USER_AGENTS.get((int) (Math.random() * USER_AGENTS.size())); String randomUserAgent = USER_AGENTS.get((int) (Math.random() * USER_AGENTS.size()));
options.addArguments("--user-agent=" + randomUserAgent); options.addArguments("--user-agent=" + randomUserAgent);
// 禁用图片加载提高速度
options.addArguments("--blink-settings=imagesEnabled=false"); // 真实窗口大小避免固定指纹
// 设置窗口大小 int width = 1024 + (int) (Math.random() * 600);
options.addArguments("--window-size=1920,1080"); int height = 700 + (int) (Math.random() * 500);
// 禁用自动化控制特征 options.addArguments("--window-size=" + width + "," + height);
// 尽量减少自动化特征
options.addArguments("--disable-blink-features=AutomationControlled"); options.addArguments("--disable-blink-features=AutomationControlled");
options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
// 禁用自动化标志
options.addArguments("--disable-extensions"); options.addArguments("--disable-extensions");
options.addArguments("--no-sandbox"); options.addArguments("--no-sandbox");
options.addArguments("--disable-dev-shm-usage"); options.addArguments("--disable-dev-shm-usage");
// 禁用SSL错误
options.addArguments("--ignore-ssl-errors"); options.addArguments("--ignore-ssl-errors");
options.addArguments("--ignore-certificate-errors"); options.addArguments("--ignore-certificate-errors");
// 禁用日志
options.addArguments("--log-level=3");
options.addArguments("--silent");
// 如果需要使用代理轮换代理IP可以有效避免验证码 // 复用用户数据目录可降低验证码概率
/* String userDataDir = System.getenv("GOOGLE_CHROME_USER_DATA_DIR");
if (PROXY_HOST != null && !PROXY_HOST.isEmpty()) { if (userDataDir != null && !userDataDir.trim().isEmpty()) {
// 可以在这里集成代理IP服务每次使用不同的IP options.addArguments("--user-data-dir=" + userDataDir.trim());
options.addArguments("--proxy-server=http://" + PROXY_HOST + ":" + PROXY_PORT);
} }
*/
// 根据需求保留浏览器窗口以静态变量为准
boolean detach = KEEP_BROWSER_OPEN; // 当需要保留窗口时开启 detach
options.setExperimentalOption("detach", detach);
if (detach) {
log.info("ChromeOptions 已启用 detach={}, 浏览器可能在会话结束后保持打开", detach);
}
WebDriver driver = new ChromeDriver(options); WebDriver driver = new ChromeDriver(options);
// 执行JavaScript隐藏webdriver属性 // 隐藏 webdriver 标志
try {
((org.openqa.selenium.JavascriptExecutor) driver).executeScript( ((org.openqa.selenium.JavascriptExecutor) driver).executeScript(
"Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"); "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
} catch (Exception ignore) {
}
return driver; return driver;
} }
@ -206,137 +326,202 @@ public class GoogleUtil {
} }
} }
/**
 * Logs, in page order, the organic results recognised on one result page so
 * that a computed rank can be cross-checked by hand.
 * <p>
 * Nothing is written to disk: every entry is emitted through the logger
 * only. An entry is taken from each {@code div.tF2Cxc} container that has a
 * {@code .yuRUbf a[href]} link; entries whose host contains "google." and
 * repeated host|href pairs are skipped, and at most 10 entries are logged.
 * This is a diagnostic aid, so any failure is logged as a warning and never
 * propagated.
 *
 * @param doc      parsed result page
 * @param baseRank number of results on the preceding pages; the first entry
 *                 of this page is reported as {@code baseRank + 1}
 */
private static void dumpOrganicList(Document doc, int baseRank) {
    try {
        java.util.Set<String> seen = new java.util.HashSet<>();
        int count = 0;
        for (Element item : doc.select("div.tF2Cxc")) {
            Element anchor = item.selectFirst(".yuRUbf a[href]");
            if (anchor == null) {
                continue;
            }
            String href = unwrapRedirectHref(anchor.attr("href"));
            String hrefHost = extractHost(href);
            String citeHost = extractDisplayHostFromItem(item);
            // Prefer the host of the real link; fall back to the displayed one.
            String effHost = (hrefHost != null && !hrefHost.isEmpty()) ? hrefHost : citeHost;
            if (effHost == null || effHost.isEmpty()) {
                continue;
            }
            // Google-owned hosts are not organic results.
            if (effHost.contains("google.")) {
                continue;
            }
            // De-duplicate on host + link so the numbering matches the rank parser.
            if (!seen.add(effHost + "|" + (href == null ? "" : href))) {
                continue;
            }
            count++;
            String line = String.format("#%d citeHost=%s hrefHost=%s href=%s",
                    baseRank + count, citeHost, hrefHost, href);
            log.info("本页有机{}: {}", count, line);
            if (count >= 10) {
                break;
            }
        }
        log.info("本页有机共{}条", count);
    } catch (Exception e) {
        log.warn("写入有机结果清单失败: {}", e.getMessage());
    }
}

/**
 * Expands a Google redirect link of the form {@code /url?q=<target>&...} to
 * its URL-decoded target. Any other href, or a redirect whose {@code q}
 * value cannot be decoded, is returned unchanged.
 */
private static String unwrapRedirectHref(String href) {
    if (href == null || !href.startsWith("/url?")) {
        return href;
    }
    String query = href.substring(href.indexOf('?') + 1);
    for (String pair : query.split("&")) {
        int eq = pair.indexOf('=');
        if (eq > 0 && "q".equals(pair.substring(0, eq))) {
            try {
                return URLDecoder.decode(pair.substring(eq + 1), StandardCharsets.UTF_8.name());
            } catch (java.io.UnsupportedEncodingException | IllegalArgumentException ignored) {
                // Malformed escape sequence: keep the raw href rather than fail.
                return href;
            }
        }
    }
    return href;
}
/** /**
* 解析排名 * 解析排名
*/ */
private static int parseRankFromHtml(String html, String targetWebSite, int baseRank) { private static int parseRankFromHtml(String html, String targetWebSite, int baseRank) {
Document doc = Jsoup.parse(html); Document doc = Jsoup.parse(html);
// 获取所有可能的搜索结果容器 // 优先按有机结果容器顺序计数广义抓取 #search 下的 tF2Cxc再在循环内过滤 PAAWt5Tfe/related-question-pair
Elements resultContainers = doc.select("div.g"); Elements items = doc.select("div.tF2Cxc");
String cleanTarget = normalizeHost(targetWebSite);
if (resultContainers.isEmpty()) {
log.warn("未找到任何搜索结果容器");
return -1;
}
log.debug("找到 {} 个搜索结果容器", resultContainers.size());
String cleanTarget = targetWebSite
.replace("https://", "")
.replace("http://", "")
.replace("www.", "");
// 如果目标网站包含路径则只取域名部分进行比较
if (cleanTarget.contains("/")) {
cleanTarget = cleanTarget.substring(0, cleanTarget.indexOf("/"));
}
int validResultCount = 0; int validResultCount = 0;
java.util.Set<String> counted = new java.util.HashSet<>();
if (!items.isEmpty()) {
for (Element item : items) {
// 仅基于 tF2Cxc 容器udm=14 已筛选网页通常无 PAA 干扰
for (Element container : resultContainers) { Element a = item.selectFirst(".yuRUbf a[href]");
// 跳过图片和视频区块 if (a == null) continue;
if (container.select("div.XaIwc, div.islrc, div.JTuIPc, div.PiKbc, div[jscontroller]").size() > 0) { String href = a.attr("href");
continue; if (href == null || href.isEmpty()) continue;
// 展开 /url?q= 形式
if (href.startsWith("/url?")) {
String query = href.substring(href.indexOf('?') + 1);
for (String pair : query.split("&")) {
int idx = pair.indexOf('=');
if (idx > 0) {
String key = pair.substring(0, idx);
String val = pair.substring(idx + 1);
if ("q".equals(key)) {
try {
href = URLDecoder.decode(val, StandardCharsets.UTF_8.name());
} catch (Exception ignore) {
} }
// 查找容器中的链接
Elements links = container.select("a[href]");
if (links.isEmpty()) {
continue;
}
boolean isSearchResult = false;
String resultUrl = "";
// 查找有效的搜索结果链接
for (Element link : links) {
String href = link.attr("href");
// 跳过Google内部链接
if (href.contains("google.com") || href.startsWith("/url?") || href.startsWith("#")) {
continue;
}
// 检查是否是标题链接通常在h3标签内
if (link.parent() != null && link.parent().tagName().equals("h3")) {
isSearchResult = true;
resultUrl = href;
break;
}
// 或者检查是否在常见的搜索结果区块中
Element parent = link.parent();
while (parent != null) {
if (parent.hasClass("yuRUbf") || parent.hasClass("tF2Cxc")) {
isSearchResult = true;
resultUrl = href;
break;
}
parent = parent.parent();
}
if (isSearchResult) {
break; break;
} }
} }
}
if (!isSearchResult) {
continue;
} }
String host = extractHost(href);
String citeHost = extractDisplayHostFromItem(item);
if (citeHost == null || citeHost.isEmpty()) citeHost = host;
if (host == null || host.isEmpty()) host = citeHost;
if (host == null || host.isEmpty()) continue;
if (host.endsWith("google.com") || host.endsWith("google.cn") || host.contains("google.")) {
continue;
}
String key = (host == null ? "" : host) + "|" + (href == null ? "" : href);
if (!counted.add(key)) continue; // 去重确保与dump一致
validResultCount++; validResultCount++;
if (host.equalsIgnoreCase(cleanTarget) || citeHost.equalsIgnoreCase(cleanTarget) ||
String cleanResult = resultUrl host.endsWith("." + cleanTarget) || citeHost.endsWith("." + cleanTarget) ||
.replace("https://", "") cleanTarget.endsWith("." + host) || cleanTarget.endsWith("." + citeHost)) {
.replace("http://", "")
.replace("www.", "");
// 如果结果URL包含路径则只取域名部分进行比较
if (cleanResult.contains("/")) {
int firstSlash = cleanResult.indexOf("/");
if (firstSlash > 0) {
cleanResult = cleanResult.substring(0, firstSlash);
}
}
// 匹配目标网站
if (cleanResult.contains(cleanTarget) || cleanTarget.contains(cleanResult)) {
int actualRank = baseRank + validResultCount; int actualRank = baseRank + validResultCount;
log.info("找到目标网站:排名={}, 结果URL={}, 目标网站={}", actualRank, resultUrl, targetWebSite); log.info("找到目标网站:排名={}, URL={}, host={}, citeHost={}, target={}", actualRank, href, host, citeHost, cleanTarget);
return actualRank; return actualRank;
} }
} }
}
// 不使用 h3 兜底机制避免引入非有机卡片造成误差仅以 tF2Cxc 且非 PAA 的结果计数
log.info("在当前页面未找到目标网站targetWebSite={}", targetWebSite); log.info("在当前页面未找到目标网站targetWebSite={}", targetWebSite);
return -1; return -1;
} }
/**
 * Extracts the host shown in a result's visible breadcrumb, i.e. the text of
 * the first {@code .byrV5b cite} element of the container that is not inside
 * a {@code [jsname=YrZdPb]} element (presumably the collapsed "people also
 * ask" box; confirm against live markup).
 * <p>
 * The breadcrumb text looks like "https://zh.wikipedia.org" followed by
 * path segments separated by the U+203A arrow. Only the part before the
 * first arrow is kept, spaces are removed, and the result is passed through
 * {@code normalizeHost}.
 *
 * @param item one result container
 * @return the value produced by {@code normalizeHost}, or "" when no usable
 *         cite element exists or any error occurs
 */
private static String extractDisplayHostFromItem(Element item) {
    try {
        Element cite = null;
        for (Element candidate : item.select(".byrV5b cite")) {
            if (candidate.parents().select("[jsname=YrZdPb]").isEmpty()) {
                cite = candidate;
                break;
            }
        }
        if (cite == null) {
            return "";
        }
        String txt = cite.text();
        if (txt == null || txt.isEmpty()) {
            return "";
        }
        // Keep only the URL part in front of the first breadcrumb arrow (U+203A).
        // Searching for an empty string here would always yield 0 and skip the cut.
        int idxArrow = txt.indexOf('\u203A');
        if (idxArrow > 0) {
            txt = txt.substring(0, idxArrow).trim();
        }
        txt = txt.replace(" ", "");
        if (txt.startsWith("http")) {
            try {
                return normalizeHost(new URL(txt).getHost());
            } catch (java.net.MalformedURLException ignored) {
                // Not a parseable URL: fall through and treat the text as a bare host.
            }
        }
        // The breadcrumb may show a bare host such as zh.wikipedia.org.
        return normalizeHost(txt);
    } catch (Exception e) {
        // Best effort: the display host is only a fallback, so never propagate.
        return "";
    }
}
/** /**
* 检查是否为验证码页面 * 检查是否为验证码页面
*
* @param driver WebDriver实例 * @param driver WebDriver实例
* @return 是否为验证码页面 * @return 是否为验证码页面
*/ */
private static boolean isCaptchaPage(WebDriver driver) { private static boolean isCaptchaPage(WebDriver driver) {
try { try {
String pageSource = driver.getPageSource().toLowerCase(); String currentUrl = "";
String currentUrl = driver.getCurrentUrl().toLowerCase(); String pageSource = "";
try {
currentUrl = driver.getCurrentUrl().toLowerCase();
} catch (Exception ignore) {
}
try {
pageSource = driver.getPageSource().toLowerCase();
} catch (Exception ignore) {
}
// 检查页面是否包含验证码相关关键词 // URL 层面的强信号
boolean isCaptchaUrl = currentUrl.contains("sorry/index") ||
currentUrl.contains("security-check") ||
currentUrl.contains("captcha");
// 页面文本层面的信号
boolean hasCaptchaKeywords = pageSource.contains("captcha") || boolean hasCaptchaKeywords = pageSource.contains("captcha") ||
pageSource.contains("recaptcha") || pageSource.contains("recaptcha") ||
pageSource.contains("人机验证") || pageSource.contains("人机验证") ||
pageSource.contains("异常流量") || pageSource.contains("异常流量") ||
pageSource.contains("security check") || pageSource.contains("security check") ||
pageSource.contains("sorry/index") ||
pageSource.contains("before we can serve your request"); pageSource.contains("before we can serve your request");
// 检查URL是否为验证码页面 return isCaptchaUrl || hasCaptchaKeywords;
boolean isCaptchaUrl = currentUrl.contains("sorry/index") ||
currentUrl.contains("security-check") ||
currentUrl.contains("captcha");
return hasCaptchaKeywords || isCaptchaUrl;
} catch (Exception e) { } catch (Exception e) {
log.error("检查验证码页面时出错", e); log.error("检查验证码页面时出错", e);
return false; return false;
@ -350,6 +535,37 @@ public class GoogleUtil {
System.out.println("维基百科排名:" + formatRank(rank1)); System.out.println("维基百科排名:" + formatRank(rank1));
} }
/**
 * Extracts the host of an absolute or protocol-relative URL and passes it
 * through {@code normalizeHost}.
 *
 * @param urlStr a URL starting with "http" or with "//" (the latter is
 *               parsed as if it were https); may be null
 * @return the normalised host, or null when {@code urlStr} is null, has
 *         neither of those prefixes, or cannot be parsed as a URL
 */
private static String extractHost(String urlStr) {
    // Explicit guard instead of relying on a caught NullPointerException.
    if (urlStr == null) {
        return null;
    }
    String absolute;
    if (urlStr.startsWith("http")) {
        absolute = urlStr;
    } else if (urlStr.startsWith("//")) {
        absolute = "https:" + urlStr;
    } else {
        // Relative links carry no host of their own.
        return null;
    }
    try {
        return normalizeHost(new URL(absolute).getHost());
    } catch (java.net.MalformedURLException e) {
        return null;
    }
}
/**
 * Reduces a URL or bare host name to a comparable host: lower-case, without
 * scheme, path, query string, fragment, port or a leading "www.".
 * <p>
 * Lower-casing happens first so that prefixes such as "WWW." or "HTTPS://"
 * are recognised regardless of case, and it uses {@code Locale.ROOT} so the
 * result does not depend on the JVM's default locale.
 *
 * @param hostOrUrl a URL or host name; may be null
 * @return the normalised host, or "" when the input is null or empty
 */
private static String normalizeHost(String hostOrUrl) {
    if (hostOrUrl == null || hostOrUrl.isEmpty()) {
        return "";
    }
    String s = hostOrUrl.trim().toLowerCase(java.util.Locale.ROOT);
    // Strip the scheme only where it can legally appear: at the start.
    if (s.startsWith("https://")) {
        s = s.substring("https://".length());
    } else if (s.startsWith("http://")) {
        s = s.substring("http://".length());
    }
    // Cut at the first path, query or fragment delimiter.
    int end = s.length();
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        if (c == '/' || c == '?' || c == '#') {
            end = i;
            break;
        }
    }
    s = s.substring(0, end);
    // Drop a trailing ":port" so the value compares equal to URL.getHost().
    int colon = s.lastIndexOf(':');
    if (colon >= 0) {
        boolean digitsOnly = true;
        for (int i = colon + 1; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c < '0' || c > '9') {
                digitsOnly = false;
                break;
            }
        }
        if (digitsOnly) {
            s = s.substring(0, colon);
        }
    }
    if (s.startsWith("www.")) {
        s = s.substring(4);
    }
    return s;
}
// 辅助方法格式化排名输出 // 辅助方法格式化排名输出
private static String formatRank(int rank) { private static String formatRank(int rank) {
if (rank > 0) return rank + ""; if (rank > 0) return rank + "";

View File

@ -114,7 +114,7 @@
<!-- 添加或修改站点管理对话框 --> <!-- 添加或修改站点管理对话框 -->
<el-dialog :title="title" :visible.sync="open" width="500px" append-to-body> <el-dialog :title="title" :visible.sync="open" width="500px" append-to-body>
<el-form ref="form" :model="form" :rules="rules" label-width="80px"> <el-form ref="form" :model="form" :rules="rules" label-width="80px">
<el-form-item label="站点编码" prop="siteName"> <el-form-item label="站点编码" prop="id">
<el-input v-model="form.id" placeholder="请输入站点编码" /> <el-input v-model="form.id" placeholder="请输入站点编码" />
</el-form-item> </el-form-item>
<el-form-item label="站点名称" prop="siteName"> <el-form-item label="站点名称" prop="siteName">
@ -179,7 +179,15 @@ export default {
form: {}, form: {},
// //
rules: { rules: {
id: [
{ required: true, message: '请输入站点编码', trigger: 'blur' }
],
siteName: [
{ required: true, message: '请输入站点名称', trigger: 'blur' }
],
siteUrl: [
{ required: true, message: '请输入站点网址', trigger: 'blur' }
],
}, },
//id //id
dealId:null, dealId:null,

View File

@ -87,7 +87,12 @@
border border
:tree-props="{children: 'children', hasChildren: 'hasChildren'}" :tree-props="{children: 'children', hasChildren: 'hasChildren'}"
v-loading="loading" :data="categoryList" > v-loading="loading" :data="categoryList" >
<el-table-column label="栏目名称" align="left" prop="catgName" /> <el-table-column label="栏目名称" align="left" prop="catgName" >
<template slot-scope="scope" >
{{scope.row.catgName}}
<template v-if="checkIfAppCatg(scope.row.id)"><el-tag>APP</el-tag></template>
</template>
</el-table-column>
<el-table-column label="栏目级别" align="center" prop="catgLevel"> <el-table-column label="栏目级别" align="center" prop="catgLevel">
<template slot-scope="scope"> <template slot-scope="scope">
{{scope.row.catgLevel}}级栏目 {{scope.row.catgLevel}}级栏目
@ -161,7 +166,7 @@ import { listCategory,delCategory} from "@/api/busi/category";
export default { export default {
name: "Category", name: "Category",
components:{}, components:{},
dicts: ['category_type'], dicts: ['category_type','app_menu'],
data() { data() {
return { return {
// //
@ -233,7 +238,18 @@ export default {
const url = `https://www.cdtruck.com/OutOpen/AddEmailRecord?callback=result&fromEmail=alicesales@scdtrailer.com&pathPage=https://www.cdtruck.com/&typeid=F9&_=1751524873382` const url = `https://www.cdtruck.com/OutOpen/AddEmailRecord?callback=result&fromEmail=alicesales@scdtrailer.com&pathPage=https://www.cdtruck.com/&typeid=F9&_=1751524873382`
window.open(url, "_blank"); window.open(url, "_blank");
}, },
/**
* 判断是否APP栏目
*/
checkIfAppCatg(catgId){
let arra = this.dict.type.app_menu
for(let i=0;i<arra.length;i++){
if(arra[i].label == catgId){
return true
}
}
return false
},
/** 查询网站栏目列表 */ /** 查询网站栏目列表 */
getList() { getList() {
this.loading = true; this.loading = true;