文件本地上传+知识图谱功能

This commit is contained in:
朱春云 2025-11-30 20:08:07 +08:00
parent 018c447227
commit 7b4d6a9f54
2 changed files with 154 additions and 1 deletions

View File

@@ -60,7 +60,7 @@
 <dependency>
     <groupId>org.jsoup</groupId>
     <artifactId>jsoup</artifactId>
-    <version>1.14.3</version>
+    <version>1.15.3</version>
 </dependency>
 <!-- Eclipse Paho MQTT 5.0 客户端 -->

View File

@@ -0,0 +1,153 @@
package xyz.playedu.common.util;
import com.alibaba.fastjson.JSON;
import com.google.gson.Gson;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
/**
 * Converts rich-text HTML into a nested, JSON-friendly structure and can tag
 * every element inside the body with a generated unique {@code id} attribute.
 *
 * <p>All members are static. Id generation is serialized on the class monitor
 * (see {@link #nextId()}).
 */
public class RichTextToJsonConverter {

    // ---- Snowflake-style id layout -------------------------------------

    /** Custom epoch subtracted from the wall clock: 2021-01-01 00:00:00 (UTC+8). */
    private static final long START_TIMESTAMP = 1609459200000L;

    private static final long DATA_CENTER_ID_BITS = 5L;
    private static final long WORKER_ID_BITS = 5L;
    private static final long SEQUENCE_BITS = 12L;

    /** Largest value that fits in each bit field (all ones for the field width). */
    private static final long MAX_DATA_CENTER_ID = ~(-1L << DATA_CENTER_ID_BITS);
    private static final long MAX_WORKER_ID = ~(-1L << WORKER_ID_BITS);
    private static final long MAX_SEQUENCE = ~(-1L << SEQUENCE_BITS);

    /** Left shifts that place each field in the final 64-bit id. */
    private static final long WORKER_ID_SHIFT = SEQUENCE_BITS;
    private static final long DATA_CENTER_ID_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS;
    private static final long TIMESTAMP_LEFT_SHIFT =
            SEQUENCE_BITS + WORKER_ID_BITS + DATA_CENTER_ID_BITS;

    /** Fixed node identity baked into every generated id. */
    private static final long DATA_CENTER_ID = 1L;
    private static final long WORKER_ID = 1L;

    // Mutable generator state; only read or written inside the synchronized nextId().
    private static long lastTimestamp = -1L;
    private static long sequence = 0L;

    /**
     * Parses an HTML fragment and converts each top-level element of the
     * parsed body into a nested map (see {@link #parseElement(Element)}).
     *
     * @param html HTML to convert; {@code null} or empty yields an empty list
     * @return an immutable single-entry map whose {@code "content"} key holds
     *         the list of converted top-level elements
     */
    public static Map<String, Object> convertToStructuredJson(String html) {
        List<Map<String, Object>> elements = new ArrayList<>();
        // Guard against null so callers get an empty result instead of a parser failure.
        if (html != null && !html.isEmpty()) {
            Document doc = Jsoup.parse(html);
            for (Element element : doc.body().children()) {
                elements.add(parseElement(element));
            }
        }
        return Collections.singletonMap("content", elements);
    }

    /**
     * Assigns a generated {@code id} attribute ({@code "elem_" + nextId()}) to
     * every element inside the body of the parsed HTML. Any {@code id} an
     * element already carries is overwritten.
     *
     * @param html HTML source to annotate
     * @return the serialized document, including the wrapping
     *         {@code html}/{@code head}/{@code body} structure
     */
    private static String addIdAttributes(String html) {
        Document doc = Jsoup.parse(html);
        Element body = doc.body();
        // select("*") also matches the context element itself, so skip <body>:
        // only the elements inside the body are meant to be tagged.
        Elements candidates = body.select("*");
        for (Element element : candidates) {
            if (element == body) {
                continue;
            }
            element.attr("id", "elem_" + nextId());
        }
        return doc.html();
    }

    /**
     * Generates the next snowflake-style id: elapsed milliseconds since
     * {@link #START_TIMESTAMP}, the data-center id, the worker id and a
     * per-millisecond sequence, packed into one {@code long}.
     *
     * <p>The method is {@code synchronized}, so the generator state is only
     * ever touched by one thread at a time.
     *
     * @return the generated id
     * @throws IllegalStateException if the system clock is behind the
     *         timestamp used for the previously generated id
     */
    private static synchronized long nextId() {
        long timestamp = System.currentTimeMillis();
        if (timestamp < lastTimestamp) {
            throw new IllegalStateException("Clock moved backwards. Refusing to generate id for "
                    + (lastTimestamp - timestamp) + " milliseconds");
        }
        if (timestamp == lastTimestamp) {
            // Same millisecond: bump the sequence; on wrap-around wait for the next tick.
            sequence = (sequence + 1) & MAX_SEQUENCE;
            if (sequence == 0L) {
                timestamp = tilNextMillis(lastTimestamp);
            }
        } else {
            sequence = 0L;
        }
        lastTimestamp = timestamp;
        return ((timestamp - START_TIMESTAMP) << TIMESTAMP_LEFT_SHIFT)
                | (DATA_CENTER_ID << DATA_CENTER_ID_SHIFT)
                | (WORKER_ID << WORKER_ID_SHIFT)
                | sequence;
    }

    /**
     * Busy-waits until the system clock reports a millisecond strictly after
     * the given one.
     *
     * @param lastTimestamp millisecond timestamp that must be passed
     * @return the first observed timestamp greater than {@code lastTimestamp}
     */
    private static long tilNextMillis(long lastTimestamp) {
        long timestamp = System.currentTimeMillis();
        while (timestamp <= lastTimestamp) {
            timestamp = System.currentTimeMillis();
        }
        return timestamp;
    }

    /**
     * Recursively converts an element into a map with the keys {@code type}
     * (tag name), {@code text}, {@code html} (inner HTML), {@code id} (only
     * when the attribute is non-empty), {@code position}, and {@code children}
     * (only when the element has child elements).
     *
     * @param element element to convert
     * @return the map describing {@code element}; keys iterate in insertion order
     */
    private static Map<String, Object> parseElement(Element element) {
        // LinkedHashMap keeps a stable key order, so serialized output is deterministic.
        Map<String, Object> result = new LinkedHashMap<>();
        result.put("type", element.tagName());
        result.put("text", element.text());
        result.put("html", element.html());

        // Carry over the element's id attribute when present.
        String elementId = element.attr("id");
        if (!elementId.isEmpty()) {
            result.put("id", elementId);
        }

        // Position info; see calculatePosition for the current limitations.
        result.put("position", calculatePosition(element));

        Elements childElements = element.children();
        if (!childElements.isEmpty()) {
            List<Map<String, Object>> children = new ArrayList<>(childElements.size());
            for (Element child : childElements) {
                children.add(parseElement(child));
            }
            result.put("children", children);
        }
        return result;
    }

    /**
     * Builds the position descriptor for an element.
     *
     * <p>NOTE: {@code charStart} is a placeholder that is always {@code 0};
     * {@code charEnd} is the length of the element's own text. A real offset
     * within the document would have to be computed separately (for example
     * from the text offset used when rendering).
     *
     * @param element element to describe
     * @return map with the keys {@code charStart} and {@code charEnd}
     */
    private static Map<String, Integer> calculatePosition(Element element) {
        Map<String, Integer> pos = new LinkedHashMap<>();
        pos.put("charStart", 0); // placeholder value, not a real document offset
        pos.put("charEnd", element.text().length());
        return pos;
    }

    /**
     * Manual smoke test: tags a sample article with ids, prints the annotated
     * HTML, then prints the structured conversion as pretty-printed JSON.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Sample data: rich-text content as it would appear in a teaching material.
        String html = "<h1><strong style=\"background-color: rgb(255, 255, 255);\">丹青绘峥嵘 星火传精神 | 北京恒爱慈助公益基金会参加“人民必胜”主题党日活动</strong></h1><p>2025年正值中国人民抗日战争暨世界反法西斯战争胜利80周年。9月3日盛大阅兵仪式的震撼与感动仍在心中激荡次日9月4日北京市扶贫济困领域基金会第三联合党委组织参观“人民必胜——纪念中国人民抗日战争暨世界反法西斯战争胜利80周年美术作品展”又将这份爱国激情再度升华。于无声画作里听见历史的怒吼于厚重色彩间感触民族的血脉这次主题党日活动完成了一场从视觉直达心灵的爱国崇高接力。</p><p><br></p><p><img src=\"/cmsProject/profile/upload/2025/11/13/4 北京恒爱慈助公益基金会参加“人民必胜”主题党日活动_20251113151000A009.jpg\"></p><p><br></p><p>北京恒爱慈助公益基金会理事长胡忠信先生和秘书长王萌女士参与了此次活动,他们怀着无比崇敬与激动的心情,在记录重大战役的恢弘画作前驻足凝思,透过那斑斓的色彩和刚劲的线条,仿佛看到硝烟弥漫的战场,听到震耳欲聋的枪炮声,感受到将士们浴血奋战、视死如归的壮志豪情。</p><p><br></p><p><img src=\"/cmsProject/profile/upload/2025/11/13/图片1_20251113151126A010.png\"></p><p>在一件件栩栩如生的雕塑前,大家轻声交流,眼神中满是感动与敬仰,那一幕幕军民携手、共克时艰的温暖场景,如同冬日里的暖阳,照亮了每一个人的心灵。一件件充满力量感的艺术作品,宛如一把把钥匙,打开了历史的大门,让抗战时期的苦难历程与不屈精神穿越时空,直抵内心,现场氛围庄严肃穆。</p><p><img src=\"/cmsProject/profile/upload/2025/11/13/图片2_20251113151141A011.png\"></p><p><br></p><p>铭记历史、缅怀先烈、珍爱和平、开创未来,不是空洞的口号,需要我们用实际行动去弘扬、去传承。胡理事长表示:抗战精神中“天下兴亡、匹夫有责”的爱国情怀,“百折不挠、坚韧不拔”的必胜信念,正是新时代公益人需要传承的精神内核。我们要用每一次真诚援助传递温暖,用每一份爱心善举诠释责任,书写公益慈善壮丽篇章,让抗战精神在新时代熠熠生辉!</p>";
        String withIdHtml = addIdAttributes(html);
        System.out.println(withIdHtml);
        // Run the conversion on the annotated HTML.
        Map<String, Object> result = convertToStructuredJson(withIdHtml);
        // Pretty-print the result with FastJSON.
        System.out.println(JSON.toJSONString(result, true));
    }
}