GetWeChatArticleService.java 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438
  1. package com.cslg.ppa.service.GetWebArticle;
  2. import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
  3. import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
  4. import com.cslg.ppa.common.core.base.RedisConf;
  5. import com.cslg.ppa.common.utils.DateUtil;
  6. import com.cslg.ppa.common.utils.RedisUtil;
  7. import com.cslg.ppa.dto.GetArticleInfoDTO;
  8. import com.cslg.ppa.entity.ArticleInfo;
  9. import com.cslg.ppa.entity.SourceInfo;
  10. import com.cslg.ppa.entity.commom.Article;
  11. import com.cslg.ppa.entity.commom.WxResultBody;
  12. import com.cslg.ppa.mapper.SourceInfoMapper;
  13. import com.cslg.ppa.service.ArticleInfoService;
  14. import com.cslg.ppa.service.commom.DifyService;
  15. import com.cslg.ppa.service.commom.FileManagerService;
  16. import com.cslg.ppa.service.commom.WeiXinApi;
  17. import lombok.RequiredArgsConstructor;
  18. import lombok.extern.slf4j.Slf4j;
  19. import org.apache.commons.io.IOUtils;
  20. import org.apache.commons.lang3.ObjectUtils;
  21. import org.apache.commons.lang3.StringUtils;
  22. import org.apache.http.client.methods.CloseableHttpResponse;
  23. import org.apache.http.client.methods.HttpGet;
  24. import org.apache.http.impl.client.CloseableHttpClient;
  25. import org.apache.http.impl.client.HttpClients;
  26. import org.apache.http.util.EntityUtils;
  27. import org.jsoup.Jsoup;
  28. import org.jsoup.nodes.Document;
  29. import org.jsoup.nodes.Element;
  30. import org.jsoup.select.Elements;
  31. import org.springframework.beans.factory.annotation.Autowired;
  32. import org.springframework.beans.factory.annotation.Value;
  33. import org.springframework.scheduling.annotation.Scheduled;
  34. import org.springframework.stereotype.Service;
  35. import org.springframework.transaction.annotation.Propagation;
  36. import org.springframework.transaction.annotation.Transactional;
  37. import org.springframework.util.CollectionUtils;
  38. import java.io.File;
  39. import java.io.FileOutputStream;
  40. import java.io.IOException;
  41. import java.io.InputStream;
  42. import java.net.HttpURLConnection;
  43. import java.net.MalformedURLException;
  44. import java.net.URL;
  45. import java.nio.file.Files;
  46. import java.text.SimpleDateFormat;
  47. import java.util.ArrayList;
  48. import java.util.Collections;
  49. import java.util.Date;
  50. import java.util.List;
  51. import java.util.regex.Matcher;
  52. import java.util.regex.Pattern;
  53. import java.util.stream.Collectors;
  54. @Slf4j
  55. @Service
  56. @RequiredArgsConstructor
  57. public class GetWeChatArticleService {
  58. @Autowired
  59. private SourceInfoMapper sourceInfoMapper;
  60. @Autowired
  61. private ArticleInfoService articleInfoService;
  62. @Autowired
  63. private DifyService difyService;
  64. @Autowired
  65. private FileManagerService fileManagerService;
  66. @Autowired
  67. private RedisUtil redisUtil;
  68. // @Scheduled(cron = "0 0 6 * * ?")
  69. // @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Throwable.class)
  70. public void getWeChatArticle() throws Exception {
  71. String token = redisUtil.get(RedisConf.WECHAT_TOKEN + RedisConf.SYMBOL_COLON);
  72. String cookie = redisUtil.get(RedisConf.WECHAT_COOKIE + RedisConf.SYMBOL_COLON);
  73. System.out.println(new Date() + "Wechat-Begin");
  74. List<SourceInfo> sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper<SourceInfo>()
  75. .eq(SourceInfo::getSourceType, 2));
  76. for (SourceInfo sourceInfo : sourceInfos) {
  77. final String fakeId = sourceInfo.getFakeId();
  78. WxResultBody<List<Article>> findExList = WeiXinApi.findExList(fakeId, token, cookie);
  79. List<Article> exList = findExList.getApp_msg_list();
  80. List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
  81. for (Article article : exList) {
  82. String createTimeSecondStr = article.getCreate_time();
  83. long secondCreateTime = Long.parseLong(createTimeSecondStr);
  84. String createTimeStr = DateUtil.convertTimestamp(secondCreateTime);
  85. String yesterdayDateStr = DateUtil.getYesterdayDateStr();
  86. if (!StringUtils.equals(createTimeStr, yesterdayDateStr)) {
  87. break;
  88. }
  89. Date createTime = new Date();
  90. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  91. try {
  92. createTime = dateFormat.parse(createTimeStr);
  93. } catch (Exception e) {
  94. continue;
  95. }
  96. String title = article.getTitle();
  97. String link = article.getLink();
  98. String cover = article.getCover();
  99. // 检查文章是否已存在
  100. long sum = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
  101. .eq(ArticleInfo::getTitle, title)
  102. .or()
  103. .eq(ArticleInfo::getArticleUrl, link));
  104. if (sum > 0) {
  105. continue; // 文章已存在,跳过
  106. }
  107. //获取公众号内容
  108. String weChatArticleContent = null;
  109. try {
  110. weChatArticleContent = this.getWeChatArticleContent(link, cookie);
  111. } catch (Exception e) {
  112. weChatArticleContent = "";
  113. }
  114. if (StringUtils.isEmpty(weChatArticleContent)) {
  115. continue;
  116. }
  117. String condensedAbstract = null;
  118. try {
  119. condensedAbstract = difyService.getCondensedAbstract(weChatArticleContent);
  120. } catch (Exception e) {
  121. continue;
  122. }
  123. if (StringUtils.isEmpty(condensedAbstract)) {
  124. continue;
  125. }
  126. GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
  127. switch (sourceInfo.getSourceName()) {
  128. case "国专知识产权公众号":
  129. case "知识产权界公众号":
  130. case "IPRdaily公众号":
  131. case "Bayes美国知识产权公众号":
  132. articleInfoDTO.setCategoryId(this.wxArticleClassify(title));
  133. break;
  134. case "跨域知见公众号":
  135. articleInfoDTO.setCategoryId(12);
  136. break;
  137. default:
  138. articleInfoDTO.setCategoryId(6);
  139. break;
  140. }
  141. articleInfoDTO.setSourceId(sourceInfo.getId());
  142. articleInfoDTO.setArticleUrl(link);
  143. articleInfoDTO.setTitle(title);
  144. articleInfoDTO.setPublicTime(createTime);
  145. if (StringUtils.isNotEmpty(cover)) {
  146. String guid = this.getGuid(cover);
  147. articleInfoDTO.setWxArticleIcon(guid);
  148. }
  149. articleInfoDTO.setDigest(condensedAbstract);
  150. articleInfoDTOS.add(articleInfoDTO);
  151. }
  152. articleInfoService.batchAddArticleInfo(articleInfoDTOS);
  153. }
  154. System.out.println(new Date() + "Wechat-End");
  155. }
  156. public void saveWeChatArticle(String fakeId, String sourceName, Integer sourceInfoId) {
  157. String token = redisUtil.get(RedisConf.WECHAT_TOKEN + RedisConf.SYMBOL_COLON);
  158. String cookie = redisUtil.get(RedisConf.WECHAT_COOKIE + RedisConf.SYMBOL_COLON);
  159. WxResultBody<List<Article>> findExList = WeiXinApi.findExList(fakeId, token, cookie);
  160. List<Article> exList = findExList.getApp_msg_list();
  161. List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
  162. for (Article article : exList) {
  163. String createTimeSecondStr = article.getCreate_time();
  164. long secondCreateTime = Long.parseLong(createTimeSecondStr);
  165. String createTimeStr = DateUtil.convertTimestamp(secondCreateTime);
  166. String yesterdayDateStr = DateUtil.getYesterdayDateStr();
  167. if (!StringUtils.equals(createTimeStr, yesterdayDateStr)) {
  168. break;
  169. }
  170. Date createTime = new Date();
  171. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  172. try {
  173. createTime = dateFormat.parse(createTimeStr);
  174. } catch (Exception e) {
  175. continue;
  176. }
  177. String title = article.getTitle().trim();
  178. String link = article.getLink();
  179. String cover = article.getCover();
  180. // 检查文章是否已存在
  181. long sum = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
  182. .eq(ArticleInfo::getTitle, title)
  183. .or()
  184. .eq(ArticleInfo::getArticleUrl, link));
  185. if (sum > 0) {
  186. continue; // 文章已存在,跳过
  187. }
  188. //获取公众号内容
  189. String weChatArticleContent = null;
  190. try {
  191. weChatArticleContent = this.getWeChatArticleContent(link,cookie);
  192. } catch (Exception e) {
  193. weChatArticleContent = "";
  194. }
  195. if (StringUtils.isEmpty(weChatArticleContent)) {
  196. continue;
  197. }
  198. String condensedAbstract = null;
  199. try {
  200. condensedAbstract = difyService.getCondensedAbstract(weChatArticleContent);
  201. } catch (Exception e) {
  202. continue;
  203. }
  204. if (StringUtils.isEmpty(condensedAbstract)) {
  205. continue;
  206. }
  207. GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
  208. switch (sourceName) {
  209. case "国专知识产权公众号":
  210. case "知识产权界公众号":
  211. case "IPRdaily公众号":
  212. case "Bayes美国知识产权公众号":
  213. articleInfoDTO.setCategoryId(this.wxArticleClassify(title));
  214. break;
  215. case "跨域知见公众号":
  216. articleInfoDTO.setCategoryId(12);
  217. break;
  218. default:
  219. articleInfoDTO.setCategoryId(6);
  220. break;
  221. }
  222. articleInfoDTO.setSourceId(sourceInfoId);
  223. articleInfoDTO.setArticleUrl(link);
  224. articleInfoDTO.setTitle(title);
  225. articleInfoDTO.setPublicTime(createTime);
  226. if (StringUtils.isNotEmpty(cover)) {
  227. String guid = this.getGuid(cover);
  228. articleInfoDTO.setWxArticleIcon(guid);
  229. }
  230. articleInfoDTO.setDigest(condensedAbstract);
  231. articleInfoDTOS.add(articleInfoDTO);
  232. }
  233. articleInfoService.batchAddArticleInfo(articleInfoDTOS);
  234. }
  235. public Integer wxArticleClassify(String content) {
  236. int classify = 6;
  237. try {
  238. String classifyStr = difyService.getArticleClassify(content);
  239. if (StringUtils.isNotEmpty(classifyStr)) {
  240. if (classifyStr.contains("判例")) {
  241. classify = 3;
  242. } else if (classifyStr.contains("国外")) {
  243. classify = 4;
  244. } else if (classifyStr.contains("行业")) {
  245. classify = 5;
  246. }
  247. }
  248. } catch (Exception e) {
  249. return classify;
  250. }
  251. return classify;
  252. }
  253. public String getWeChatArticleContent(String articleUrl, String cookie) {
  254. String content = "";
  255. try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
  256. HttpGet request = new HttpGet(articleUrl);
  257. // 设置完整的请求头(关键步骤!)
  258. request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36");
  259. request.setHeader("Referer", "https://mp.weixin.qq.com/");
  260. request.setHeader("Cookie", cookie);
  261. //执行请求并处理响应
  262. try (CloseableHttpResponse response = httpClient.execute(request)) {
  263. final int code = response.getStatusLine().getStatusCode();
  264. // 检查响应状态码
  265. if (code != 200) {
  266. System.err.println("请求失败,状态码: " + code);
  267. return content;
  268. }
  269. // 4. 解析HTML内容
  270. String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8");
  271. Document doc = Jsoup.parse(htmlContent);
  272. // 5. 提取文章正文(微信使用特定class)
  273. Element contentElement = doc.selectFirst("#js_content");
  274. if (contentElement != null) {
  275. List<String> list = new ArrayList<>();
  276. Elements elements = contentElement.select("p");
  277. for (Element element : elements) {
  278. String text = element.text();
  279. if (StringUtils.isNotEmpty(text)) {
  280. list.add(text);
  281. }
  282. }
  283. if (CollectionUtils.isEmpty(list) || list.size() < 5) {
  284. String articleContent = contentElement.text();
  285. list.add(articleContent);
  286. content = StringUtils.join(list, "\n");
  287. } else {
  288. content = StringUtils.join(list, "\n");
  289. }
  290. }
  291. }
  292. } catch (Exception e) {
  293. }
  294. return content;
  295. }
  296. public String getGuid(String url) {
  297. String guid = "";
  298. try {
  299. URL fileUrl = new URL(url);
  300. HttpURLConnection connection = (HttpURLConnection) fileUrl.openConnection();
  301. File tempFile = File.createTempFile("tem-", ".jpeg");
  302. try (InputStream in = connection.getInputStream(); FileOutputStream out = new FileOutputStream(tempFile)) {
  303. IOUtils.copy(in, out);
  304. }
  305. List<String> list = null;
  306. try {
  307. list = fileManagerService.uploadFileGetGuid2(Collections.singletonList(tempFile));
  308. } catch (IOException e) {
  309. list = new ArrayList<>();
  310. }
  311. Files.delete(tempFile.toPath());
  312. if (!CollectionUtils.isEmpty(list)) {
  313. guid = list.get(0);
  314. } else {
  315. guid = url;
  316. }
  317. } catch (Exception e) {
  318. return url;
  319. }
  320. return guid;
  321. }
  322. //测试使用方法1
  323. public void getWeChatArticleContent1(String articleUrl, Integer categoryId, Integer sourceId) {
  324. try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
  325. HttpGet request = new HttpGet(articleUrl);
  326. // 设置完整的请求头(关键步骤!)
  327. request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36");
  328. request.setHeader("Referer", "https://mp.weixin.qq.com/");
  329. // request.setHeader("Cookie", cookie);
  330. //执行请求并处理响应
  331. try (CloseableHttpResponse response = httpClient.execute(request)) {
  332. final int code = response.getStatusLine().getStatusCode();
  333. // 检查响应状态码
  334. if (code != 200) {
  335. System.err.println("请求失败,状态码: " + code);
  336. }
  337. // 4. 解析HTML内容
  338. String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8");
  339. Document doc = Jsoup.parse(htmlContent);
  340. Element h1 = doc.selectFirst("h1");
  341. String title = h1.text();
  342. Elements scripts = doc.select("script");
  343. String createTimeStr = "";
  344. for (Element script : scripts) {
  345. String scriptContent = script.html();
  346. // 提取 var ct 的值(字符串或数字)
  347. createTimeStr = extractVariable(scriptContent, "ct");
  348. if (StringUtils.isNotEmpty(createTimeStr)) {
  349. break;
  350. }
  351. }
  352. Date createTime = new Date();
  353. if (StringUtils.isNotEmpty(createTimeStr)) {
  354. long secondCreateTime = Long.parseLong(createTimeStr);
  355. String createTimeStr1 = DateUtil.convertTimestamp(secondCreateTime);
  356. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  357. try {
  358. createTime = dateFormat.parse(createTimeStr1);
  359. } catch (Exception e) {
  360. }
  361. }
  362. // 5. 提取文章正文(微信使用特定class)
  363. String content = "";
  364. Element contentElement = doc.selectFirst("#js_content");
  365. if (contentElement != null) {
  366. List<String> list = new ArrayList<>();
  367. Elements elements = contentElement.select("p");
  368. for (Element element : elements) {
  369. String text = element.text();
  370. if (StringUtils.isNotEmpty(text)) {
  371. list.add(text);
  372. }
  373. }
  374. if (CollectionUtils.isEmpty(list) || list.size() < 5) {
  375. String articleContent = contentElement.text();
  376. list.add(articleContent);
  377. content = StringUtils.join(list, "\n");
  378. } else {
  379. content = StringUtils.join(list, "\n");
  380. }
  381. }
  382. // String condensedAbstract = difyService.getCondensedAbstract(content);
  383. String condensedAbstract = difyService.getZGYDSummary(content);
  384. ArticleInfo articleInfo = new ArticleInfo();
  385. articleInfo.setTitle(title);
  386. articleInfo.setDigest(condensedAbstract);
  387. articleInfo.setCategoryId(categoryId);
  388. articleInfo.setSourceId(sourceId);
  389. articleInfo.setPublicTime(createTime);
  390. articleInfo.setArticleUrl(articleUrl);
  391. articleInfo.insert();
  392. }
  393. } catch (Exception e) {
  394. }
  395. }
  396. ////测试使用方法2
  397. private static String extractVariable(String scriptContent, String varName) {
  398. // 正则表达式匹配变量赋值(支持字符串或数字)
  399. Pattern pattern = Pattern.compile(
  400. "var\\s+" + varName + "\\s*=\\s*(['\"])?(.*?)\\1\\s*;", // 匹配 var varName = 'value' 或 var varName = 123;
  401. Pattern.DOTALL
  402. );
  403. Matcher matcher = pattern.matcher(scriptContent);
  404. if (matcher.find()) {
  405. return matcher.group(2).trim(); // 返回捕获的值(第二组)
  406. }
  407. return null;
  408. }
  409. }