1
This commit is contained in:
parent
fe1f3264fa
commit
20aabdf5e8
@ -7,10 +7,14 @@ import cn.hutool.core.date.DateUtil;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.baomidou.mybatisplus.core.metadata.IPage;
|
||||
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
|
||||
import com.ruoyi.base.domain.BaseSite;
|
||||
import com.ruoyi.base.service.IBaseSiteInfoService;
|
||||
import com.ruoyi.base.service.IBaseSiteService;
|
||||
import com.ruoyi.busi.domain.BusiKeyword;
|
||||
import com.ruoyi.busi.service.IBusiKeywordService;
|
||||
import com.ruoyi.busi.vo.BusiKeywordItemQueryVO;
|
||||
import com.ruoyi.busi.vo.BusiKeywordRankStatVO;
|
||||
import com.ruoyi.common.utils.GoogleUtil;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
|
||||
@ -31,6 +35,8 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl<BusiKeywordItemMappe
|
||||
private BusiKeywordItemMapper busiKeywordItemMapper;
|
||||
@Autowired
|
||||
private IBusiKeywordService busiKeywordService;
|
||||
@Autowired
|
||||
private IBaseSiteService baseSiteService;
|
||||
|
||||
@Override
|
||||
public IPage<BusiKeywordItem> queryListPage(BusiKeywordItem pageReqVO, Page<BusiKeywordItem> page) {
|
||||
@ -46,6 +52,9 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl<BusiKeywordItemMappe
|
||||
@Override
|
||||
public void getKeywordRanking() {
|
||||
Date nowDate = new Date();
|
||||
//所有网站
|
||||
List<BaseSite> siteList = baseSiteService.list();
|
||||
Map<String,String> siteMap = siteList.stream().collect(Collectors.toMap(BaseSite::getId, BaseSite::getSiteUrl));
|
||||
List<BusiKeyword> keywordList = busiKeywordService.list();
|
||||
//删除今日所有的排名数据
|
||||
busiKeywordItemMapper.deleteBySelectDateInt(DateUtil.formatDate(nowDate));
|
||||
@ -54,8 +63,12 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl<BusiKeywordItemMappe
|
||||
BusiKeywordItem busiKeywordItem = new BusiKeywordItem();
|
||||
busiKeywordItem.setTitle(keyword.getId());
|
||||
busiKeywordItem.setSelectDate(nowDate);
|
||||
//TODO 调朱哥的接口获取排名
|
||||
busiKeywordItem.setRanking(1);
|
||||
int ranking = GoogleUtil.getGoogleRank(keyword.getId(), siteMap.get(keyword.getTenantId()));
|
||||
if(ranking>0 && ranking<=20){
|
||||
busiKeywordItem.setRanking(ranking);
|
||||
}else{
|
||||
busiKeywordItem.setRanking(0);
|
||||
}
|
||||
busiKeywordItem.setTenantId(keyword.getTenantId());
|
||||
insertList.add(busiKeywordItem);
|
||||
}
|
||||
|
||||
@ -4,16 +4,16 @@ spring:
|
||||
type: com.alibaba.druid.pool.DruidDataSource
|
||||
driverClassName: com.mysql.cj.jdbc.Driver
|
||||
druid:
|
||||
# 主库数据源-点亮开发库
|
||||
master:
|
||||
url: jdbc:mysql://82.156.161.160:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
|
||||
username: site
|
||||
password: 123456
|
||||
#主库数据源-客户测试服务器
|
||||
# # 主库数据源-点亮开发库
|
||||
# master:
|
||||
# url: jdbc:mysql://127.0.0.1:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
|
||||
# url: jdbc:mysql://82.156.161.160:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
|
||||
# username: site
|
||||
# password: 123456
|
||||
# 主库数据源-客户测试服务器
|
||||
master:
|
||||
url: jdbc:mysql://1.92.99.15:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8
|
||||
username: site
|
||||
password: 123456
|
||||
# 从库数据源
|
||||
slave:
|
||||
# 从数据源开关/默认关闭
|
||||
|
||||
@ -13,29 +13,33 @@ import org.openqa.selenium.chrome.ChromeDriver;
|
||||
import org.openqa.selenium.chrome.ChromeOptions;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import io.github.bonigarcia.wdm.WebDriverManager;
|
||||
import org.openqa.selenium.By;
|
||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||
import org.openqa.selenium.support.ui.ExpectedConditions;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.net.URLDecoder;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
public class GoogleUtil {
|
||||
private static final Logger log = LoggerFactory.getLogger(GoogleUtil.class);
|
||||
// 可配置参数(Ruoyi项目建议用@Value从application.yml读取)
|
||||
// 行为开关:是否保留浏览器窗口(默认为 false,可在运行时通过 setter 修改)
|
||||
private static volatile boolean KEEP_BROWSER_OPEN = false;
|
||||
// 可配置参数
|
||||
private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q=";
|
||||
private static final String PROXY_HOST = "127.0.0.1";
|
||||
private static final int PROXY_PORT = 7897;
|
||||
private static final int CONNECT_TIMEOUT = 10000;
|
||||
private static final int READ_TIMEOUT = 15000;
|
||||
// 重试机制
|
||||
private static final Retryer<Boolean> RETRYER = RetryerBuilder.<Boolean>newBuilder()
|
||||
.retryIfExceptionOfType(Exception.class)
|
||||
.retryIfResult(result -> result != null && !result)
|
||||
.withStopStrategy(StopStrategies.stopAfterAttempt(3))
|
||||
.withWaitStrategy(WaitStrategies.fixedWait(1, TimeUnit.SECONDS))
|
||||
.build();
|
||||
|
||||
// 扩展User-Agent池(增加移动端标识,反爬更友好)
|
||||
private static final List<String> USER_AGENTS = Arrays.asList(
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
|
||||
@ -45,6 +49,21 @@ public class GoogleUtil {
|
||||
"Mozilla/5.0 (Android 14; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0"
|
||||
);
|
||||
|
||||
public static void setKeepBrowserOpen(boolean keep) {
|
||||
KEEP_BROWSER_OPEN = keep;
|
||||
}
|
||||
|
||||
public static boolean isKeepBrowserOpen() {
|
||||
return KEEP_BROWSER_OPEN;
|
||||
}
|
||||
|
||||
// 是否要求页面 UI 显示“网页/全部”筛选(默认开启)。若未检测到则跳过本次采集。
|
||||
private static volatile boolean REQUIRE_WEB_TAB = true;
|
||||
|
||||
public static void setRequireWebTab(boolean require) {
|
||||
REQUIRE_WEB_TAB = require;
|
||||
}
|
||||
|
||||
/**
|
||||
* 核心方法:获取Google搜索排名(使用Selenium实现)
|
||||
*/
|
||||
@ -88,6 +107,11 @@ public class GoogleUtil {
|
||||
return 0;
|
||||
}
|
||||
String finalUrl = GOOGLE_SEARCH_URL + encodedSearchText;
|
||||
// 允许指定 gl(地域)
|
||||
String gl = System.getenv("GOOGLE_GL");
|
||||
if (gl != null && !gl.trim().isEmpty()) {
|
||||
finalUrl = finalUrl + "&gl=" + gl.trim();
|
||||
}
|
||||
log.info("发起Google搜索请求:url={}, 目标网站={}", finalUrl, webSite);
|
||||
|
||||
WebDriver driver = null;
|
||||
@ -96,16 +120,57 @@ public class GoogleUtil {
|
||||
driver = createWebDriver();
|
||||
|
||||
int rank = -1;
|
||||
// 搜索前三页
|
||||
for (int page = 1; page <= 3; page++) {
|
||||
// 仅搜索前2页(前20名)
|
||||
for (int page = 1; page <= 2; page++) {
|
||||
String pageUrl = page == 1 ? finalUrl : finalUrl + "&start=" + ((page - 1) * 10);
|
||||
// 统一参数:关闭个性化、语言,仅网页结果(udm=14)
|
||||
pageUrl += "&hl=zh-CN&pws=0&num=10&udm=14";
|
||||
log.info("搜索第{}页: {}", page, pageUrl);
|
||||
|
||||
// 先访问 NCR 以避免地区重定向
|
||||
try {
|
||||
driver.get("https://www.google.com/ncr");
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
// 访问后立刻检查是否跳转到人机验证/异常流量页面
|
||||
try {
|
||||
String cur = driver.getCurrentUrl().toLowerCase();
|
||||
if (cur.contains("/sorry/") || cur.contains("sorry/index") || cur.contains("security-check") || cur.contains("captcha")) {
|
||||
log.warn("发现跳转至验证页面: {}", cur);
|
||||
return -2;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("获取当前URL失败,可能窗口被关闭或被拦截: {}", e.getMessage());
|
||||
return -2;
|
||||
}
|
||||
|
||||
|
||||
// 访问Google搜索页面
|
||||
driver.get(pageUrl);
|
||||
|
||||
// 随机延时,模拟人类行为(增加延时以减少被识别为机器人的可能性)
|
||||
Thread.sleep((long) (3000 + Math.random() * 5000));
|
||||
// 显式等待搜索结果区域、标题出现,避免未加载完全
|
||||
try {
|
||||
WebDriverWait wait = new WebDriverWait(driver, 10);
|
||||
wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div#search")));
|
||||
// 等待有机结果容器
|
||||
wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div#search div.MjjYud div.tF2Cxc, div#search a h3")));
|
||||
} catch (Exception e) {
|
||||
log.warn("等待搜索结果元素出现超时: {}", e.getMessage());
|
||||
}
|
||||
|
||||
// 人类化随机等待
|
||||
Thread.sleep((long) (1500 + Math.random() * 3000));
|
||||
|
||||
// 轻微滚动,模拟浏览
|
||||
try {
|
||||
org.openqa.selenium.JavascriptExecutor js = (org.openqa.selenium.JavascriptExecutor) driver;
|
||||
long steps = 2 + (int) (Math.random() * 3);
|
||||
for (int i = 0; i < steps; i++) {
|
||||
js.executeScript("window.scrollBy(0, arguments[0]);", 200 + (int) (Math.random() * 400));
|
||||
Thread.sleep((long) (400 + Math.random() * 600));
|
||||
}
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
|
||||
// 检查是否出现了验证码页面
|
||||
if (isCaptchaPage(driver)) {
|
||||
@ -117,19 +182,40 @@ public class GoogleUtil {
|
||||
String html = driver.getPageSource();
|
||||
log.debug("第{}页请求成功:响应长度={}字节", page, html.length());
|
||||
|
||||
// 如需页面必须显示“网页/全部”筛选标签,则检查 UI,否则跳过采集
|
||||
if (REQUIRE_WEB_TAB) {
|
||||
try {
|
||||
String curUrl = null;
|
||||
try { curUrl = driver.getCurrentUrl(); } catch (Exception ignore) {}
|
||||
boolean hasWebTab = (curUrl != null && curUrl.contains("udm=14"))
|
||||
|| html.contains(">网页<") || html.contains(">全部<")
|
||||
|| html.contains(">Web<") || html.contains(">All<");
|
||||
if (!hasWebTab) {
|
||||
log.warn("未检测到‘网页/全部’筛选标签,跳过本次采集(REQUIRE_WEB_TAB=true)");
|
||||
return 0;
|
||||
}
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
}
|
||||
|
||||
// 解析当前页的排名
|
||||
int pageRank = parseRankFromHtml(html, webSite, (page - 1) * 10);
|
||||
int baseRankForPage = (page - 1) * 10;
|
||||
int pageRank = parseRankFromHtml(html, webSite, baseRankForPage);
|
||||
// 写入本页有机结果清单(仅用于排查)
|
||||
try {
|
||||
dumpOrganicList(Jsoup.parse(html), baseRankForPage);
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
if (pageRank > 0) {
|
||||
rank = pageRank;
|
||||
break;
|
||||
}
|
||||
|
||||
// 页面间增加随机延时
|
||||
if (page < 3) {
|
||||
if (page < 2) {
|
||||
Thread.sleep((long) (2000 + Math.random() * 3000));
|
||||
}
|
||||
}
|
||||
|
||||
return rank;
|
||||
} catch (Exception e) {
|
||||
log.error("获取排名异常:url={}, 原因={}", finalUrl, e.getMessage(), e);
|
||||
@ -137,59 +223,93 @@ public class GoogleUtil {
|
||||
} finally {
|
||||
// 关闭浏览器
|
||||
if (driver != null) {
|
||||
if (!KEEP_BROWSER_OPEN) {
|
||||
try {
|
||||
driver.quit();
|
||||
} catch (Exception e) {
|
||||
log.error("关闭浏览器失败", e);
|
||||
}
|
||||
} else {
|
||||
log.info("根据配置保留浏览器窗口开启(KEEP_BROWSER_OPEN=true)");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建WebDriver实例
|
||||
*
|
||||
* @return WebDriver
|
||||
*/
|
||||
private static WebDriver createWebDriver() {
|
||||
// 设置ChromeDriver路径(根据实际路径调整)
|
||||
System.setProperty("webdriver.chrome.driver", "D:/chromedriver-win64/chromedriver.exe");
|
||||
// 优先使用 WebDriverManager 自动管理驱动
|
||||
try {
|
||||
WebDriverManager.chromedriver().setup();
|
||||
log.info("WebDriverManager 已自动配置 chromedriver");
|
||||
} catch (Throwable t) {
|
||||
// 允许失败后使用手动路径
|
||||
log.warn("WebDriverManager 配置失败,尝试使用手动路径: {}", t.getMessage());
|
||||
String sysProp = System.getProperty("webdriver.chrome.driver");
|
||||
String envProp = System.getenv("CHROMEDRIVER_PATH");
|
||||
if (sysProp == null || sysProp.isEmpty()) {
|
||||
String path = (envProp != null && !envProp.isEmpty()) ? envProp : "D:/chromedriver.exe";
|
||||
System.setProperty("webdriver.chrome.driver", path);
|
||||
log.info("使用ChromeDriver路径: {}", path);
|
||||
} else {
|
||||
log.info("检测到系统属性webdriver.chrome.driver: {}", sysProp);
|
||||
}
|
||||
}
|
||||
|
||||
ChromeOptions options = new ChromeOptions();
|
||||
// 设置无头模式 (Selenium 3.x语法)
|
||||
options.addArguments("--headless");
|
||||
// 设置User-Agent
|
||||
// 按环境变量决定是否 headless(默认有头,更像真实用户)
|
||||
boolean headless = Boolean.parseBoolean(System.getenv().getOrDefault("GOOGLE_HEADLESS", "false"));
|
||||
if (headless) options.addArguments("--headless=new");
|
||||
|
||||
// 语言与用户体验设置
|
||||
options.addArguments("--lang=zh-CN");
|
||||
Map<String, Object> prefs = new HashMap<>();
|
||||
prefs.put("intl.accept_languages", "zh-CN,zh;q=0.9");
|
||||
options.setExperimentalOption("prefs", prefs);
|
||||
|
||||
// User-Agent 随机化
|
||||
String randomUserAgent = USER_AGENTS.get((int) (Math.random() * USER_AGENTS.size()));
|
||||
options.addArguments("--user-agent=" + randomUserAgent);
|
||||
// 禁用图片加载提高速度
|
||||
options.addArguments("--blink-settings=imagesEnabled=false");
|
||||
// 设置窗口大小
|
||||
options.addArguments("--window-size=1920,1080");
|
||||
// 禁用自动化控制特征
|
||||
|
||||
// 真实窗口大小(避免固定指纹)
|
||||
int width = 1024 + (int) (Math.random() * 600);
|
||||
int height = 700 + (int) (Math.random() * 500);
|
||||
options.addArguments("--window-size=" + width + "," + height);
|
||||
|
||||
// 尽量减少自动化特征
|
||||
options.addArguments("--disable-blink-features=AutomationControlled");
|
||||
options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation"));
|
||||
// 禁用自动化标志
|
||||
options.addArguments("--disable-extensions");
|
||||
options.addArguments("--no-sandbox");
|
||||
options.addArguments("--disable-dev-shm-usage");
|
||||
// 禁用SSL错误
|
||||
options.addArguments("--ignore-ssl-errors");
|
||||
options.addArguments("--ignore-certificate-errors");
|
||||
// 禁用日志
|
||||
options.addArguments("--log-level=3");
|
||||
options.addArguments("--silent");
|
||||
|
||||
// 如果需要使用代理(轮换代理IP可以有效避免验证码)
|
||||
/*
|
||||
if (PROXY_HOST != null && !PROXY_HOST.isEmpty()) {
|
||||
// 可以在这里集成代理IP服务,每次使用不同的IP
|
||||
options.addArguments("--proxy-server=http://" + PROXY_HOST + ":" + PROXY_PORT);
|
||||
// 复用用户数据目录(可降低验证码概率)
|
||||
String userDataDir = System.getenv("GOOGLE_CHROME_USER_DATA_DIR");
|
||||
if (userDataDir != null && !userDataDir.trim().isEmpty()) {
|
||||
options.addArguments("--user-data-dir=" + userDataDir.trim());
|
||||
}
|
||||
*/
|
||||
|
||||
// 根据需求保留浏览器窗口(以静态变量为准)
|
||||
boolean detach = KEEP_BROWSER_OPEN; // 当需要保留窗口时,开启 detach
|
||||
options.setExperimentalOption("detach", detach);
|
||||
if (detach) {
|
||||
log.info("ChromeOptions 已启用 detach={}, 浏览器可能在会话结束后保持打开", detach);
|
||||
}
|
||||
|
||||
|
||||
WebDriver driver = new ChromeDriver(options);
|
||||
// 执行JavaScript隐藏webdriver属性
|
||||
// 隐藏 webdriver 标志
|
||||
try {
|
||||
((org.openqa.selenium.JavascriptExecutor) driver).executeScript(
|
||||
"Object.defineProperty(navigator, 'webdriver', {get: () => undefined})");
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
|
||||
return driver;
|
||||
}
|
||||
@ -206,137 +326,202 @@ public class GoogleUtil {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 将当前页面识别到的有机结果(已过滤 PAA)按序写入临时文件,便于对照
|
||||
*/
|
||||
private static void dumpOrganicList(Document doc, int baseRank) {
|
||||
try {
|
||||
List<String> lines = new java.util.ArrayList<>();
|
||||
java.util.Set<String> printed = new java.util.HashSet<>();
|
||||
int idx = 0;
|
||||
// 优先使用 tF2Cxc 容器
|
||||
Elements tfItems = doc.select("div.tF2Cxc");
|
||||
for (Element item : tfItems) {
|
||||
// 仅基于 tF2Cxc 容器(udm=14 已筛选网页,通常无 PAA 干扰)
|
||||
Element a = item.selectFirst(".yuRUbf a[href]");
|
||||
if (a == null) continue;
|
||||
String href = a.attr("href");
|
||||
// 展开 /url?q=
|
||||
if (href != null && href.startsWith("/url?")) {
|
||||
String query = href.substring(href.indexOf('?') + 1);
|
||||
for (String pair : query.split("&")) {
|
||||
int idxEq = pair.indexOf('=');
|
||||
if (idxEq > 0) {
|
||||
String key = pair.substring(0, idxEq);
|
||||
String val = pair.substring(idxEq + 1);
|
||||
if ("q".equals(key)) {
|
||||
try {
|
||||
href = URLDecoder.decode(val, StandardCharsets.UTF_8.name());
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
String hrefHost = extractHost(href);
|
||||
String citeHost = extractDisplayHostFromItem(item);
|
||||
String effHost = (hrefHost != null && !hrefHost.isEmpty()) ? hrefHost : citeHost;
|
||||
if (effHost == null || effHost.isEmpty()) continue;
|
||||
if (effHost.endsWith("google.com") || effHost.endsWith("google.cn") || effHost.contains("google."))
|
||||
continue;
|
||||
String key = (effHost == null ? "" : effHost) + "|" + (href == null ? "" : href);
|
||||
if (!printed.add(key)) continue;
|
||||
idx++;
|
||||
int rank = baseRank + idx;
|
||||
String line = String.format("#%d citeHost=%s hrefHost=%s href=%s", rank, citeHost, hrefHost, href);
|
||||
lines.add(line);
|
||||
log.info("本页有机{}: {}", idx, line);
|
||||
if (idx >= 10) break;
|
||||
}
|
||||
// 兜底:如不足10条,回退到 a h3 结构补足
|
||||
if (idx < 10) {
|
||||
// 保持首页清单完整性的同时,尽量避免 fallback 误入非有机卡片;若你不需要 fallback 可以将本块去掉
|
||||
}
|
||||
log.info("本页有机共{}条", idx);
|
||||
} catch (Exception e) {
|
||||
log.warn("写入有机结果清单失败: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析排名
|
||||
*/
|
||||
private static int parseRankFromHtml(String html, String targetWebSite, int baseRank) {
|
||||
Document doc = Jsoup.parse(html);
|
||||
|
||||
// 获取所有可能的搜索结果容器
|
||||
Elements resultContainers = doc.select("div.g");
|
||||
|
||||
if (resultContainers.isEmpty()) {
|
||||
log.warn("未找到任何搜索结果容器");
|
||||
return -1;
|
||||
}
|
||||
|
||||
log.debug("找到 {} 个搜索结果容器", resultContainers.size());
|
||||
|
||||
String cleanTarget = targetWebSite
|
||||
.replace("https://", "")
|
||||
.replace("http://", "")
|
||||
.replace("www.", "");
|
||||
|
||||
// 如果目标网站包含路径,则只取域名部分进行比较
|
||||
if (cleanTarget.contains("/")) {
|
||||
cleanTarget = cleanTarget.substring(0, cleanTarget.indexOf("/"));
|
||||
}
|
||||
// 优先按有机结果容器顺序计数:广义抓取 #search 下的 tF2Cxc,再在循环内过滤 PAA(Wt5Tfe/related-question-pair)
|
||||
Elements items = doc.select("div.tF2Cxc");
|
||||
String cleanTarget = normalizeHost(targetWebSite);
|
||||
|
||||
int validResultCount = 0;
|
||||
java.util.Set<String> counted = new java.util.HashSet<>();
|
||||
if (!items.isEmpty()) {
|
||||
for (Element item : items) {
|
||||
// 仅基于 tF2Cxc 容器(udm=14 已筛选网页,通常无 PAA 干扰)
|
||||
|
||||
for (Element container : resultContainers) {
|
||||
// 跳过图片和视频区块
|
||||
if (container.select("div.XaIwc, div.islrc, div.JTuIPc, div.PiKbc, div[jscontroller]").size() > 0) {
|
||||
continue;
|
||||
Element a = item.selectFirst(".yuRUbf a[href]");
|
||||
if (a == null) continue;
|
||||
String href = a.attr("href");
|
||||
if (href == null || href.isEmpty()) continue;
|
||||
|
||||
// 展开 /url?q= 形式
|
||||
if (href.startsWith("/url?")) {
|
||||
String query = href.substring(href.indexOf('?') + 1);
|
||||
for (String pair : query.split("&")) {
|
||||
int idx = pair.indexOf('=');
|
||||
if (idx > 0) {
|
||||
String key = pair.substring(0, idx);
|
||||
String val = pair.substring(idx + 1);
|
||||
if ("q".equals(key)) {
|
||||
try {
|
||||
href = URLDecoder.decode(val, StandardCharsets.UTF_8.name());
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
|
||||
// 查找容器中的链接
|
||||
Elements links = container.select("a[href]");
|
||||
if (links.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
boolean isSearchResult = false;
|
||||
String resultUrl = "";
|
||||
|
||||
// 查找有效的搜索结果链接
|
||||
for (Element link : links) {
|
||||
String href = link.attr("href");
|
||||
|
||||
// 跳过Google内部链接
|
||||
if (href.contains("google.com") || href.startsWith("/url?") || href.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 检查是否是标题链接(通常在h3标签内)
|
||||
if (link.parent() != null && link.parent().tagName().equals("h3")) {
|
||||
isSearchResult = true;
|
||||
resultUrl = href;
|
||||
break;
|
||||
}
|
||||
|
||||
// 或者检查是否在常见的搜索结果区块中
|
||||
Element parent = link.parent();
|
||||
while (parent != null) {
|
||||
if (parent.hasClass("yuRUbf") || parent.hasClass("tF2Cxc")) {
|
||||
isSearchResult = true;
|
||||
resultUrl = href;
|
||||
break;
|
||||
}
|
||||
parent = parent.parent();
|
||||
}
|
||||
|
||||
if (isSearchResult) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!isSearchResult) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
String host = extractHost(href);
|
||||
String citeHost = extractDisplayHostFromItem(item);
|
||||
if (citeHost == null || citeHost.isEmpty()) citeHost = host;
|
||||
if (host == null || host.isEmpty()) host = citeHost;
|
||||
if (host == null || host.isEmpty()) continue;
|
||||
if (host.endsWith("google.com") || host.endsWith("google.cn") || host.contains("google.")) {
|
||||
continue;
|
||||
}
|
||||
String key = (host == null ? "" : host) + "|" + (href == null ? "" : href);
|
||||
if (!counted.add(key)) continue; // 去重,确保与dump一致
|
||||
validResultCount++;
|
||||
|
||||
String cleanResult = resultUrl
|
||||
.replace("https://", "")
|
||||
.replace("http://", "")
|
||||
.replace("www.", "");
|
||||
|
||||
// 如果结果URL包含路径,则只取域名部分进行比较
|
||||
if (cleanResult.contains("/")) {
|
||||
int firstSlash = cleanResult.indexOf("/");
|
||||
if (firstSlash > 0) {
|
||||
cleanResult = cleanResult.substring(0, firstSlash);
|
||||
}
|
||||
}
|
||||
|
||||
// 匹配目标网站
|
||||
if (cleanResult.contains(cleanTarget) || cleanTarget.contains(cleanResult)) {
|
||||
if (host.equalsIgnoreCase(cleanTarget) || citeHost.equalsIgnoreCase(cleanTarget) ||
|
||||
host.endsWith("." + cleanTarget) || citeHost.endsWith("." + cleanTarget) ||
|
||||
cleanTarget.endsWith("." + host) || cleanTarget.endsWith("." + citeHost)) {
|
||||
int actualRank = baseRank + validResultCount;
|
||||
log.info("找到目标网站:排名={}, 结果URL={}, 目标网站={}", actualRank, resultUrl, targetWebSite);
|
||||
log.info("找到目标网站:排名={}, URL={}, host={}, citeHost={}, target={}", actualRank, href, host, citeHost, cleanTarget);
|
||||
return actualRank;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// 不使用 h3 兜底机制,避免引入非有机卡片造成误差;仅以 tF2Cxc 且非 PAA 的结果计数
|
||||
log.info("在当前页面未找到目标网站:targetWebSite={}", targetWebSite);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 从有机结果容器中提取展示域名(来自 .byrV5b > cite 的可见链接)
|
||||
*/
|
||||
private static String extractDisplayHostFromItem(Element item) {
|
||||
try {
|
||||
// 选择第一个“非 PAA 折叠容器(jsname=YrZdPb)内”的 cite 作为展示域名
|
||||
Elements cites = item.select(".byrV5b cite");
|
||||
Element cite = null;
|
||||
for (Element c : cites) {
|
||||
if (c.parents().select("[jsname=YrZdPb]").isEmpty()) {
|
||||
cite = c;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cite == null) return "";
|
||||
String txt = cite.text(); // 例如:https://zh.wikipedia.org › zh-cn › 大型貨車
|
||||
if (txt == null || txt.isEmpty()) return "";
|
||||
// 仅取 URL 主体部分,丢弃“ › ”后面的路径
|
||||
int idxArrow = txt.indexOf("›");
|
||||
if (idxArrow > 0) {
|
||||
txt = txt.substring(0, idxArrow).trim();
|
||||
}
|
||||
// 去掉多余空格
|
||||
txt = txt.replace(" ", "");
|
||||
// 提取 host
|
||||
if (txt.startsWith("http")) {
|
||||
try {
|
||||
URL u = new URL(txt);
|
||||
return normalizeHost(u.getHost());
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
}
|
||||
// 如果不是完整URL,比如直接显示 zh.wikipedia.org
|
||||
return normalizeHost(txt);
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查是否为验证码页面
|
||||
*
|
||||
* @param driver WebDriver实例
|
||||
* @return 是否为验证码页面
|
||||
*/
|
||||
private static boolean isCaptchaPage(WebDriver driver) {
|
||||
try {
|
||||
String pageSource = driver.getPageSource().toLowerCase();
|
||||
String currentUrl = driver.getCurrentUrl().toLowerCase();
|
||||
String currentUrl = "";
|
||||
String pageSource = "";
|
||||
try {
|
||||
currentUrl = driver.getCurrentUrl().toLowerCase();
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
try {
|
||||
pageSource = driver.getPageSource().toLowerCase();
|
||||
} catch (Exception ignore) {
|
||||
}
|
||||
|
||||
// 检查页面是否包含验证码相关关键词
|
||||
// URL 层面的强信号
|
||||
boolean isCaptchaUrl = currentUrl.contains("sorry/index") ||
|
||||
currentUrl.contains("security-check") ||
|
||||
currentUrl.contains("captcha");
|
||||
|
||||
// 页面文本层面的信号
|
||||
boolean hasCaptchaKeywords = pageSource.contains("captcha") ||
|
||||
pageSource.contains("recaptcha") ||
|
||||
pageSource.contains("人机验证") ||
|
||||
pageSource.contains("异常流量") ||
|
||||
pageSource.contains("security check") ||
|
||||
pageSource.contains("sorry/index") ||
|
||||
pageSource.contains("before we can serve your request");
|
||||
|
||||
// 检查URL是否为验证码页面
|
||||
boolean isCaptchaUrl = currentUrl.contains("sorry/index") ||
|
||||
currentUrl.contains("security-check") ||
|
||||
currentUrl.contains("captcha");
|
||||
|
||||
return hasCaptchaKeywords || isCaptchaUrl;
|
||||
return isCaptchaUrl || hasCaptchaKeywords;
|
||||
} catch (Exception e) {
|
||||
log.error("检查验证码页面时出错", e);
|
||||
return false;
|
||||
@ -350,6 +535,37 @@ public class GoogleUtil {
|
||||
System.out.println("维基百科排名:" + formatRank(rank1));
|
||||
}
|
||||
|
||||
/**
|
||||
* 提取并规范化 host
|
||||
*/
|
||||
private static String extractHost(String urlStr) {
|
||||
try {
|
||||
// 绝对URL
|
||||
if (urlStr.startsWith("http")) {
|
||||
URL u = new URL(urlStr);
|
||||
return normalizeHost(u.getHost());
|
||||
}
|
||||
// 可能是相对或协议相对
|
||||
if (urlStr.startsWith("//")) {
|
||||
URL u = new URL("https:" + urlStr);
|
||||
return normalizeHost(u.getHost());
|
||||
}
|
||||
return null;
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static String normalizeHost(String hostOrUrl) {
|
||||
if (hostOrUrl == null || hostOrUrl.isEmpty()) return "";
|
||||
String s = hostOrUrl.trim();
|
||||
s = s.replace("https://", "").replace("http://", "");
|
||||
if (s.contains("/")) s = s.substring(0, s.indexOf('/'));
|
||||
if (s.startsWith("www.")) s = s.substring(4);
|
||||
return s.toLowerCase();
|
||||
}
|
||||
|
||||
|
||||
// 辅助方法:格式化排名输出
|
||||
private static String formatRank(int rank) {
|
||||
if (rank > 0) return rank + "名";
|
||||
|
||||
@ -114,7 +114,7 @@
|
||||
<!-- 添加或修改站点管理对话框 -->
|
||||
<el-dialog :title="title" :visible.sync="open" width="500px" append-to-body>
|
||||
<el-form ref="form" :model="form" :rules="rules" label-width="80px">
|
||||
<el-form-item label="站点编码" prop="siteName">
|
||||
<el-form-item label="站点编码" prop="id">
|
||||
<el-input v-model="form.id" placeholder="请输入站点编码" />
|
||||
</el-form-item>
|
||||
<el-form-item label="站点名称" prop="siteName">
|
||||
@ -179,7 +179,15 @@ export default {
|
||||
form: {},
|
||||
// 表单校验
|
||||
rules: {
|
||||
|
||||
id: [
|
||||
{ required: true, message: '请输入站点编码', trigger: 'blur' }
|
||||
],
|
||||
siteName: [
|
||||
{ required: true, message: '请输入站点名称', trigger: 'blur' }
|
||||
],
|
||||
siteUrl: [
|
||||
{ required: true, message: '请输入站点网址', trigger: 'blur' }
|
||||
],
|
||||
},
|
||||
//当前处理的数据id
|
||||
dealId:null,
|
||||
|
||||
@ -87,7 +87,12 @@
|
||||
border
|
||||
:tree-props="{children: 'children', hasChildren: 'hasChildren'}"
|
||||
v-loading="loading" :data="categoryList" >
|
||||
<el-table-column label="栏目名称" align="left" prop="catgName" />
|
||||
<el-table-column label="栏目名称" align="left" prop="catgName" >
|
||||
<template slot-scope="scope" >
|
||||
{{scope.row.catgName}}
|
||||
<template v-if="checkIfAppCatg(scope.row.id)"><el-tag>APP</el-tag></template>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column label="栏目级别" align="center" prop="catgLevel">
|
||||
<template slot-scope="scope">
|
||||
{{scope.row.catgLevel}}级栏目
|
||||
@ -161,7 +166,7 @@ import { listCategory,delCategory} from "@/api/busi/category";
|
||||
export default {
|
||||
name: "Category",
|
||||
components:{},
|
||||
dicts: ['category_type'],
|
||||
dicts: ['category_type','app_menu'],
|
||||
data() {
|
||||
return {
|
||||
// 遮罩层
|
||||
@ -233,7 +238,18 @@ export default {
|
||||
const url = `https://www.cdtruck.com/OutOpen/AddEmailRecord?callback=result&fromEmail=alicesales@scdtrailer.com&pathPage=https://www.cdtruck.com/&typeid=F9&_=1751524873382`
|
||||
window.open(url, "_blank");
|
||||
},
|
||||
|
||||
/**
|
||||
* 判断是否APP栏目
|
||||
*/
|
||||
checkIfAppCatg(catgId){
|
||||
let arra = this.dict.type.app_menu
|
||||
for(let i=0;i<arra.length;i++){
|
||||
if(arra[i].label == catgId){
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
},
|
||||
/** 查询网站栏目列表 */
|
||||
getList() {
|
||||
this.loading = true;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user