diff --git a/dl_admin/ruoyi-admin/src/main/java/com/ruoyi/busi/service/impl/BusiKeywordItemServiceImpl.java b/dl_admin/ruoyi-admin/src/main/java/com/ruoyi/busi/service/impl/BusiKeywordItemServiceImpl.java index 1f77150..9f498cc 100644 --- a/dl_admin/ruoyi-admin/src/main/java/com/ruoyi/busi/service/impl/BusiKeywordItemServiceImpl.java +++ b/dl_admin/ruoyi-admin/src/main/java/com/ruoyi/busi/service/impl/BusiKeywordItemServiceImpl.java @@ -7,10 +7,14 @@ import cn.hutool.core.date.DateUtil; import com.alibaba.fastjson2.JSONObject; import com.baomidou.mybatisplus.core.metadata.IPage; import com.baomidou.mybatisplus.extension.plugins.pagination.Page; +import com.ruoyi.base.domain.BaseSite; +import com.ruoyi.base.service.IBaseSiteInfoService; +import com.ruoyi.base.service.IBaseSiteService; import com.ruoyi.busi.domain.BusiKeyword; import com.ruoyi.busi.service.IBusiKeywordService; import com.ruoyi.busi.vo.BusiKeywordItemQueryVO; import com.ruoyi.busi.vo.BusiKeywordRankStatVO; +import com.ruoyi.common.utils.GoogleUtil; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; @@ -31,6 +35,8 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl queryListPage(BusiKeywordItem pageReqVO, Page page) { @@ -46,6 +52,9 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl siteList = baseSiteService.list(); + Map siteMap = siteList.stream().collect(Collectors.toMap(BaseSite::getId, BaseSite::getSiteUrl)); List keywordList = busiKeywordService.list(); //删除今日所有的排名数据 busiKeywordItemMapper.deleteBySelectDateInt(DateUtil.formatDate(nowDate)); @@ -54,8 +63,12 @@ public class BusiKeywordItemServiceImpl extends ServiceImpl0 && ranking<=20){ + busiKeywordItem.setRanking(ranking); + }else{ + busiKeywordItem.setRanking(0); + } busiKeywordItem.setTenantId(keyword.getTenantId()); insertList.add(busiKeywordItem); } diff --git a/dl_admin/ruoyi-admin/src/main/resources/application-druid.yml b/dl_admin/ruoyi-admin/src/main/resources/application-druid.yml index c6a11ed..15f87da 100644 --- a/dl_admin/ruoyi-admin/src/main/resources/application-druid.yml +++ b/dl_admin/ruoyi-admin/src/main/resources/application-druid.yml @@ -4,16 +4,16 @@ spring: type: com.alibaba.druid.pool.DruidDataSource driverClassName: com.mysql.cj.jdbc.Driver druid: - # 主库数据源-点亮开发库 - master: - url: jdbc:mysql://82.156.161.160:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8 - username: site - password: 123456 - #主库数据源-客户测试服务器 +# # 主库数据源-点亮开发库 # master: -# url: jdbc:mysql://127.0.0.1:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8 +# url: jdbc:mysql://82.156.161.160:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8 # username: site # password: 123456 +# 主库数据源-客户测试服务器 + master: + url: jdbc:mysql://1.92.99.15:3306/dl_site_system?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8 + username: site + password: 123456 # 从库数据源 slave: # 从数据源开关/默认关闭 diff --git a/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java b/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java index 5310f35..e774171 100644 --- a/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java +++ b/dl_admin/ruoyi-common/src/main/java/com/ruoyi/common/utils/GoogleUtil.java @@ -13,29 +13,33 @@ import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import io.github.bonigarcia.wdm.WebDriverManager; +import org.openqa.selenium.By; +import org.openqa.selenium.support.ui.WebDriverWait; +import org.openqa.selenium.support.ui.ExpectedConditions; +import org.springframework.stereotype.Component; + import java.net.URLEncoder; +import java.net.URLDecoder; +import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.HashMap; +import java.util.Map; +import java.nio.file.Files; +import java.nio.file.Paths; public class GoogleUtil { private static final Logger log = LoggerFactory.getLogger(GoogleUtil.class); - // 可配置参数(Ruoyi项目建议用@Value从application.yml读取) + // 行为开关:是否保留浏览器窗口(默认为 false,可在运行时通过 setter 修改) + private static volatile boolean KEEP_BROWSER_OPEN = false; + // 可配置参数 private static final String GOOGLE_SEARCH_URL = "https://www.google.com/search?q="; - private static final String PROXY_HOST = "127.0.0.1"; - private static final int PROXY_PORT = 7897; private static final int CONNECT_TIMEOUT = 10000; private static final int READ_TIMEOUT = 15000; - // 重试机制 - private static final Retryer RETRYER = RetryerBuilder.newBuilder() - .retryIfExceptionOfType(Exception.class) - .retryIfResult(result -> result != null && !result) - .withStopStrategy(StopStrategies.stopAfterAttempt(3)) - .withWaitStrategy(WaitStrategies.fixedWait(1, TimeUnit.SECONDS)) - .build(); - // 扩展User-Agent池(增加移动端标识,反爬更友好) private static final List USER_AGENTS = Arrays.asList( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36", @@ -45,6 +49,21 @@ public class GoogleUtil { "Mozilla/5.0 (Android 14; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0" ); + public static void setKeepBrowserOpen(boolean keep) { + KEEP_BROWSER_OPEN = keep; + } + + public static boolean isKeepBrowserOpen() { + return KEEP_BROWSER_OPEN; + } + + // 是否要求页面 UI 显示“网页/全部”筛选(默认开启)。若未检测到则跳过本次采集。 + private static volatile boolean REQUIRE_WEB_TAB = true; + + public static void setRequireWebTab(boolean require) { + REQUIRE_WEB_TAB = require; + } + /** * 核心方法:获取Google搜索排名(使用Selenium实现) */ @@ -88,6 +107,11 @@ public class GoogleUtil { return 0; } String finalUrl = GOOGLE_SEARCH_URL + encodedSearchText; + // 允许指定 gl(地域) + String gl = System.getenv("GOOGLE_GL"); + if (gl != null && !gl.trim().isEmpty()) { + finalUrl = finalUrl + "&gl=" + gl.trim(); + } log.info("发起Google搜索请求:url={}, 目标网站={}", finalUrl, webSite); WebDriver driver = null; @@ -96,16 +120,57 @@ public class GoogleUtil { driver = createWebDriver(); int rank = -1; - // 搜索前三页 - for (int page = 1; page <= 3; page++) { + // 仅搜索前2页(前20名) + for (int page = 1; page <= 2; page++) { String pageUrl = page == 1 ? finalUrl : finalUrl + "&start=" + ((page - 1) * 10); + // 统一参数:关闭个性化、语言,仅网页结果(udm=14) + pageUrl += "&hl=zh-CN&pws=0&num=10&udm=14"; log.info("搜索第{}页: {}", page, pageUrl); + // 先访问 NCR 以避免地区重定向 + try { + driver.get("https://www.google.com/ncr"); + } catch (Exception ignore) { + } + // 访问后立刻检查是否跳转到人机验证/异常流量页面 + try { + String cur = driver.getCurrentUrl().toLowerCase(); + if (cur.contains("/sorry/") || cur.contains("sorry/index") || cur.contains("security-check") || cur.contains("captcha")) { + log.warn("发现跳转至验证页面: {}", cur); + return -2; + } + } catch (Exception e) { + log.warn("获取当前URL失败,可能窗口被关闭或被拦截: {}", e.getMessage()); + return -2; + } + + // 访问Google搜索页面 driver.get(pageUrl); - // 随机延时,模拟人类行为(增加延时以减少被识别为机器人的可能性) - Thread.sleep((long) (3000 + Math.random() * 5000)); + // 显式等待搜索结果区域、标题出现,避免未加载完全 + try { + WebDriverWait wait = new WebDriverWait(driver, 10); + wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div#search"))); + // 等待有机结果容器 + wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div#search div.MjjYud div.tF2Cxc, div#search a h3"))); + } catch (Exception e) { + log.warn("等待搜索结果元素出现超时: {}", e.getMessage()); + } + + // 人类化随机等待 + Thread.sleep((long) (1500 + Math.random() * 3000)); + + // 轻微滚动,模拟浏览 + try { + org.openqa.selenium.JavascriptExecutor js = (org.openqa.selenium.JavascriptExecutor) driver; + long steps = 2 + (int) (Math.random() * 3); + for (int i = 0; i < steps; i++) { + js.executeScript("window.scrollBy(0, arguments[0]);", 200 + (int) (Math.random() * 400)); + Thread.sleep((long) (400 + Math.random() * 600)); + } + } catch (Exception ignore) { + } // 检查是否出现了验证码页面 if (isCaptchaPage(driver)) { @@ -117,19 +182,40 @@ public class GoogleUtil { String html = driver.getPageSource(); log.debug("第{}页请求成功:响应长度={}字节", page, html.length()); + // 如需页面必须显示“网页/全部”筛选标签,则检查 UI,否则跳过采集 + if (REQUIRE_WEB_TAB) { + try { + String curUrl = null; + try { curUrl = driver.getCurrentUrl(); } catch (Exception ignore) {} + boolean hasWebTab = (curUrl != null && curUrl.contains("udm=14")) + || html.contains(">网页<") || html.contains(">全部<") + || html.contains(">Web<") || html.contains(">All<"); + if (!hasWebTab) { + log.warn("未检测到‘网页/全部’筛选标签,跳过本次采集(REQUIRE_WEB_TAB=true)"); + return 0; + } + } catch (Exception ignore) { + } + } + // 解析当前页的排名 - int pageRank = parseRankFromHtml(html, webSite, (page - 1) * 10); + int baseRankForPage = (page - 1) * 10; + int pageRank = parseRankFromHtml(html, webSite, baseRankForPage); + // 写入本页有机结果清单(仅用于排查) + try { + dumpOrganicList(Jsoup.parse(html), baseRankForPage); + } catch (Exception ignore) { + } if (pageRank > 0) { rank = pageRank; break; } // 页面间增加随机延时 - if (page < 3) { + if (page < 2) { Thread.sleep((long) (2000 + Math.random() * 3000)); } } - return rank; } catch (Exception e) { log.error("获取排名异常:url={}, 原因={}", finalUrl, e.getMessage(), e); @@ -137,10 +223,14 @@ public class GoogleUtil { } finally { // 关闭浏览器 if (driver != null) { - try { - driver.quit(); - } catch (Exception e) { - log.error("关闭浏览器失败", e); + if (!KEEP_BROWSER_OPEN) { + try { + driver.quit(); + } catch (Exception e) { + log.error("关闭浏览器失败", e); + } + } else { + log.info("根据配置保留浏览器窗口开启(KEEP_BROWSER_OPEN=true)"); } } } @@ -148,48 +238,78 @@ public class GoogleUtil { /** * 创建WebDriver实例 + * * @return WebDriver */ private static WebDriver createWebDriver() { - // 设置ChromeDriver路径(根据实际路径调整) - System.setProperty("webdriver.chrome.driver", "D:/chromedriver-win64/chromedriver.exe"); + // 优先使用 WebDriverManager 自动管理驱动 + try { + WebDriverManager.chromedriver().setup(); + log.info("WebDriverManager 已自动配置 chromedriver"); + } catch (Throwable t) { + // 允许失败后使用手动路径 + log.warn("WebDriverManager 配置失败,尝试使用手动路径: {}", t.getMessage()); + String sysProp = System.getProperty("webdriver.chrome.driver"); + String envProp = System.getenv("CHROMEDRIVER_PATH"); + if (sysProp == null || sysProp.isEmpty()) { + String path = (envProp != null && !envProp.isEmpty()) ? envProp : "D:/chromedriver.exe"; + System.setProperty("webdriver.chrome.driver", path); + log.info("使用ChromeDriver路径: {}", path); + } else { + log.info("检测到系统属性webdriver.chrome.driver: {}", sysProp); + } + } ChromeOptions options = new ChromeOptions(); - // 设置无头模式 (Selenium 3.x语法) - options.addArguments("--headless"); - // 设置User-Agent + // 按环境变量决定是否 headless(默认有头,更像真实用户) + boolean headless = Boolean.parseBoolean(System.getenv().getOrDefault("GOOGLE_HEADLESS", "false")); + if (headless) options.addArguments("--headless=new"); + + // 语言与用户体验设置 + options.addArguments("--lang=zh-CN"); + Map prefs = new HashMap<>(); + prefs.put("intl.accept_languages", "zh-CN,zh;q=0.9"); + options.setExperimentalOption("prefs", prefs); + + // User-Agent 随机化 String randomUserAgent = USER_AGENTS.get((int) (Math.random() * USER_AGENTS.size())); options.addArguments("--user-agent=" + randomUserAgent); - // 禁用图片加载提高速度 - options.addArguments("--blink-settings=imagesEnabled=false"); - // 设置窗口大小 - options.addArguments("--window-size=1920,1080"); - // 禁用自动化控制特征 + + // 真实窗口大小(避免固定指纹) + int width = 1024 + (int) (Math.random() * 600); + int height = 700 + (int) (Math.random() * 500); + options.addArguments("--window-size=" + width + "," + height); + + // 尽量减少自动化特征 options.addArguments("--disable-blink-features=AutomationControlled"); options.setExperimentalOption("excludeSwitches", Arrays.asList("enable-automation")); - // 禁用自动化标志 options.addArguments("--disable-extensions"); options.addArguments("--no-sandbox"); options.addArguments("--disable-dev-shm-usage"); - // 禁用SSL错误 options.addArguments("--ignore-ssl-errors"); options.addArguments("--ignore-certificate-errors"); - // 禁用日志 - options.addArguments("--log-level=3"); - options.addArguments("--silent"); - // 如果需要使用代理(轮换代理IP可以有效避免验证码) - /* - if (PROXY_HOST != null && !PROXY_HOST.isEmpty()) { - // 可以在这里集成代理IP服务,每次使用不同的IP - options.addArguments("--proxy-server=http://" + PROXY_HOST + ":" + PROXY_PORT); + // 复用用户数据目录(可降低验证码概率) + String userDataDir = System.getenv("GOOGLE_CHROME_USER_DATA_DIR"); + if (userDataDir != null && !userDataDir.trim().isEmpty()) { + options.addArguments("--user-data-dir=" + userDataDir.trim()); } - */ + + // 根据需求保留浏览器窗口(以静态变量为准) + boolean detach = KEEP_BROWSER_OPEN; // 当需要保留窗口时,开启 detach + options.setExperimentalOption("detach", detach); + if (detach) { + log.info("ChromeOptions 已启用 detach={}, 浏览器可能在会话结束后保持打开", detach); + } + WebDriver driver = new ChromeDriver(options); - // 执行JavaScript隐藏webdriver属性 - ((org.openqa.selenium.JavascriptExecutor) driver).executeScript( - "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"); + // 隐藏 webdriver 标志 + try { + ((org.openqa.selenium.JavascriptExecutor) driver).executeScript( + "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"); + } catch (Exception ignore) { + } return driver; } @@ -206,137 +326,202 @@ public class GoogleUtil { } } + /** + * 将当前页面识别到的有机结果(已过滤 PAA)按序写入临时文件,便于对照 + */ + private static void dumpOrganicList(Document doc, int baseRank) { + try { + List lines = new java.util.ArrayList<>(); + java.util.Set printed = new java.util.HashSet<>(); + int idx = 0; + // 优先使用 tF2Cxc 容器 + Elements tfItems = doc.select("div.tF2Cxc"); + for (Element item : tfItems) { + // 仅基于 tF2Cxc 容器(udm=14 已筛选网页,通常无 PAA 干扰) + Element a = item.selectFirst(".yuRUbf a[href]"); + if (a == null) continue; + String href = a.attr("href"); + // 展开 /url?q= + if (href != null && href.startsWith("/url?")) { + String query = href.substring(href.indexOf('?') + 1); + for (String pair : query.split("&")) { + int idxEq = pair.indexOf('='); + if (idxEq > 0) { + String key = pair.substring(0, idxEq); + String val = pair.substring(idxEq + 1); + if ("q".equals(key)) { + try { + href = URLDecoder.decode(val, StandardCharsets.UTF_8.name()); + } catch (Exception ignore) { + } + break; + } + } + } + } + String hrefHost = extractHost(href); + String citeHost = extractDisplayHostFromItem(item); + String effHost = (hrefHost != null && !hrefHost.isEmpty()) ? hrefHost : citeHost; + if (effHost == null || effHost.isEmpty()) continue; + if (effHost.endsWith("google.com") || effHost.endsWith("google.cn") || effHost.contains("google.")) + continue; + String key = (effHost == null ? "" : effHost) + "|" + (href == null ? "" : href); + if (!printed.add(key)) continue; + idx++; + int rank = baseRank + idx; + String line = String.format("#%d citeHost=%s hrefHost=%s href=%s", rank, citeHost, hrefHost, href); + lines.add(line); + log.info("本页有机{}: {}", idx, line); + if (idx >= 10) break; + } + // 兜底:如不足10条,回退到 a h3 结构补足 + if (idx < 10) { + // 保持首页清单完整性的同时,尽量避免 fallback 误入非有机卡片;若你不需要 fallback 可以将本块去掉 + } + log.info("本页有机共{}条", idx); + } catch (Exception e) { + log.warn("写入有机结果清单失败: {}", e.getMessage()); + } + } + /** * 解析排名 */ private static int parseRankFromHtml(String html, String targetWebSite, int baseRank) { Document doc = Jsoup.parse(html); - // 获取所有可能的搜索结果容器 - Elements resultContainers = doc.select("div.g"); - - if (resultContainers.isEmpty()) { - log.warn("未找到任何搜索结果容器"); - return -1; - } - - log.debug("找到 {} 个搜索结果容器", resultContainers.size()); - - String cleanTarget = targetWebSite - .replace("https://", "") - .replace("http://", "") - .replace("www.", ""); - - // 如果目标网站包含路径,则只取域名部分进行比较 - if (cleanTarget.contains("/")) { - cleanTarget = cleanTarget.substring(0, cleanTarget.indexOf("/")); - } + // 优先按有机结果容器顺序计数:广义抓取 #search 下的 tF2Cxc,再在循环内过滤 PAA(Wt5Tfe/related-question-pair) + Elements items = doc.select("div.tF2Cxc"); + String cleanTarget = normalizeHost(targetWebSite); int validResultCount = 0; + java.util.Set counted = new java.util.HashSet<>(); + if (!items.isEmpty()) { + for (Element item : items) { + // 仅基于 tF2Cxc 容器(udm=14 已筛选网页,通常无 PAA 干扰) - for (Element container : resultContainers) { - // 跳过图片和视频区块 - if (container.select("div.XaIwc, div.islrc, div.JTuIPc, div.PiKbc, div[jscontroller]").size() > 0) { - continue; - } + Element a = item.selectFirst(".yuRUbf a[href]"); + if (a == null) continue; + String href = a.attr("href"); + if (href == null || href.isEmpty()) continue; - // 查找容器中的链接 - Elements links = container.select("a[href]"); - if (links.isEmpty()) { - continue; - } + // 展开 /url?q= 形式 + if (href.startsWith("/url?")) { + String query = href.substring(href.indexOf('?') + 1); + for (String pair : query.split("&")) { + int idx = pair.indexOf('='); + if (idx > 0) { + String key = pair.substring(0, idx); + String val = pair.substring(idx + 1); + if ("q".equals(key)) { + try { + href = URLDecoder.decode(val, StandardCharsets.UTF_8.name()); + } catch (Exception ignore) { + } + break; + } + } + } + } - boolean isSearchResult = false; - String resultUrl = ""; - - // 查找有效的搜索结果链接 - for (Element link : links) { - String href = link.attr("href"); - - // 跳过Google内部链接 - if (href.contains("google.com") || href.startsWith("/url?") || href.startsWith("#")) { + String host = extractHost(href); + String citeHost = extractDisplayHostFromItem(item); + if (citeHost == null || citeHost.isEmpty()) citeHost = host; + if (host == null || host.isEmpty()) host = citeHost; + if (host == null || host.isEmpty()) continue; + if (host.endsWith("google.com") || host.endsWith("google.cn") || host.contains("google.")) { continue; } - - // 检查是否是标题链接(通常在h3标签内) - if (link.parent() != null && link.parent().tagName().equals("h3")) { - isSearchResult = true; - resultUrl = href; - break; + String key = (host == null ? "" : host) + "|" + (href == null ? "" : href); + if (!counted.add(key)) continue; // 去重,确保与dump一致 + validResultCount++; + if (host.equalsIgnoreCase(cleanTarget) || citeHost.equalsIgnoreCase(cleanTarget) || + host.endsWith("." + cleanTarget) || citeHost.endsWith("." + cleanTarget) || + cleanTarget.endsWith("." + host) || cleanTarget.endsWith("." + citeHost)) { + int actualRank = baseRank + validResultCount; + log.info("找到目标网站:排名={}, URL={}, host={}, citeHost={}, target={}", actualRank, href, host, citeHost, cleanTarget); + return actualRank; } - - // 或者检查是否在常见的搜索结果区块中 - Element parent = link.parent(); - while (parent != null) { - if (parent.hasClass("yuRUbf") || parent.hasClass("tF2Cxc")) { - isSearchResult = true; - resultUrl = href; - break; - } - parent = parent.parent(); - } - - if (isSearchResult) { - break; - } - } - - if (!isSearchResult) { - continue; - } - - validResultCount++; - - String cleanResult = resultUrl - .replace("https://", "") - .replace("http://", "") - .replace("www.", ""); - - // 如果结果URL包含路径,则只取域名部分进行比较 - if (cleanResult.contains("/")) { - int firstSlash = cleanResult.indexOf("/"); - if (firstSlash > 0) { - cleanResult = cleanResult.substring(0, firstSlash); - } - } - - // 匹配目标网站 - if (cleanResult.contains(cleanTarget) || cleanTarget.contains(cleanResult)) { - int actualRank = baseRank + validResultCount; - log.info("找到目标网站:排名={}, 结果URL={}, 目标网站={}", actualRank, resultUrl, targetWebSite); - return actualRank; } } - + // 不使用 h3 兜底机制,避免引入非有机卡片造成误差;仅以 tF2Cxc 且非 PAA 的结果计数 log.info("在当前页面未找到目标网站:targetWebSite={}", targetWebSite); return -1; } + + /** + * 从有机结果容器中提取展示域名(来自 .byrV5b > cite 的可见链接) + */ + private static String extractDisplayHostFromItem(Element item) { + try { + // 选择第一个“非 PAA 折叠容器(jsname=YrZdPb)内”的 cite 作为展示域名 + Elements cites = item.select(".byrV5b cite"); + Element cite = null; + for (Element c : cites) { + if (c.parents().select("[jsname=YrZdPb]").isEmpty()) { + cite = c; + break; + } + } + if (cite == null) return ""; + String txt = cite.text(); // 例如:https://zh.wikipedia.org › zh-cn › 大型貨車 + if (txt == null || txt.isEmpty()) return ""; + // 仅取 URL 主体部分,丢弃“ › ”后面的路径 + int idxArrow = txt.indexOf("›"); + if (idxArrow > 0) { + txt = txt.substring(0, idxArrow).trim(); + } + // 去掉多余空格 + txt = txt.replace(" ", ""); + // 提取 host + if (txt.startsWith("http")) { + try { + URL u = new URL(txt); + return normalizeHost(u.getHost()); + } catch (Exception ignore) { + } + } + // 如果不是完整URL,比如直接显示 zh.wikipedia.org + return normalizeHost(txt); + } catch (Exception e) { + return ""; + } + } + /** * 检查是否为验证码页面 + * * @param driver WebDriver实例 * @return 是否为验证码页面 */ private static boolean isCaptchaPage(WebDriver driver) { try { - String pageSource = driver.getPageSource().toLowerCase(); - String currentUrl = driver.getCurrentUrl().toLowerCase(); + String currentUrl = ""; + String pageSource = ""; + try { + currentUrl = driver.getCurrentUrl().toLowerCase(); + } catch (Exception ignore) { + } + try { + pageSource = driver.getPageSource().toLowerCase(); + } catch (Exception ignore) { + } - // 检查页面是否包含验证码相关关键词 + // URL 层面的强信号 + boolean isCaptchaUrl = currentUrl.contains("sorry/index") || + currentUrl.contains("security-check") || + currentUrl.contains("captcha"); + + // 页面文本层面的信号 boolean hasCaptchaKeywords = pageSource.contains("captcha") || pageSource.contains("recaptcha") || pageSource.contains("人机验证") || pageSource.contains("异常流量") || pageSource.contains("security check") || - pageSource.contains("sorry/index") || pageSource.contains("before we can serve your request"); - // 检查URL是否为验证码页面 - boolean isCaptchaUrl = currentUrl.contains("sorry/index") || - currentUrl.contains("security-check") || - currentUrl.contains("captcha"); - - return hasCaptchaKeywords || isCaptchaUrl; + return isCaptchaUrl || hasCaptchaKeywords; } catch (Exception e) { log.error("检查验证码页面时出错", e); return false; @@ -346,10 +531,41 @@ public class GoogleUtil { // 测试方法 public static void main(String[] args) { // 测试用例:搜索关键词,查找特定网站的排名 - int rank1 = getGoogleRank("大卡车", "zh.wikipedia.org"); + int rank1 = getGoogleRank("大卡车", "zh.wikipedia.org"); System.out.println("维基百科排名:" + formatRank(rank1)); } + /** + * 提取并规范化 host + */ + private static String extractHost(String urlStr) { + try { + // 绝对URL + if (urlStr.startsWith("http")) { + URL u = new URL(urlStr); + return normalizeHost(u.getHost()); + } + // 可能是相对或协议相对 + if (urlStr.startsWith("//")) { + URL u = new URL("https:" + urlStr); + return normalizeHost(u.getHost()); + } + return null; + } catch (Exception e) { + return null; + } + } + + private static String normalizeHost(String hostOrUrl) { + if (hostOrUrl == null || hostOrUrl.isEmpty()) return ""; + String s = hostOrUrl.trim(); + s = s.replace("https://", "").replace("http://", ""); + if (s.contains("/")) s = s.substring(0, s.indexOf('/')); + if (s.startsWith("www.")) s = s.substring(4); + return s.toLowerCase(); + } + + // 辅助方法:格式化排名输出 private static String formatRank(int rank) { if (rank > 0) return rank + "名"; diff --git a/dl_vue/src/views/base/site/index.vue b/dl_vue/src/views/base/site/index.vue index 5cf9a0d..d021a81 100644 --- a/dl_vue/src/views/base/site/index.vue +++ b/dl_vue/src/views/base/site/index.vue @@ -114,7 +114,7 @@ - + @@ -179,7 +179,15 @@ export default { form: {}, // 表单校验 rules: { - + id: [ + { required: true, message: '请输入站点编码', trigger: 'blur' } + ], + siteName: [ + { required: true, message: '请输入站点名称', trigger: 'blur' } + ], + siteUrl: [ + { required: true, message: '请输入站点网址', trigger: 'blur' } + ], }, //当前处理的数据id dealId:null, diff --git a/dl_vue/src/views/busi/category/index.vue b/dl_vue/src/views/busi/category/index.vue index a2e0790..8e0a3c9 100644 --- a/dl_vue/src/views/busi/category/index.vue +++ b/dl_vue/src/views/busi/category/index.vue @@ -87,7 +87,12 @@ border :tree-props="{children: 'children', hasChildren: 'hasChildren'}" v-loading="loading" :data="categoryList" > - + + +