|
|
@@ -1,38 +1,32 @@
|
|
|
package com.zhiqiyun.open.core.service.impl;
|
|
|
|
|
|
-import com.alibaba.fastjson.JSON;
|
|
|
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
|
|
|
import com.zhiqiyun.open.core.mapper.sentiment.SentimentSpiderEventMapper;
|
|
|
import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderEvent;
|
|
|
-import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderResult;
|
|
|
import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderSiteRule;
|
|
|
import com.zhiqiyun.open.core.service.SentimentSpiderEventService;
|
|
|
import com.zhiqiyun.open.core.service.SentimentSpiderResultService;
|
|
|
-import com.zhiqiyun.open.core.spiders.OkHttpDownloader;
|
|
|
-import com.zhiqiyun.open.utils.DateUtil;
|
|
|
+import com.zhiqiyun.open.core.spiders.HtmlPageProcessor;
|
|
|
+import com.zhiqiyun.open.core.spiders.SpiderEventPipeline;
|
|
|
+import com.zhiqiyun.open.core.spiders.WeiboPageProcessor;
|
|
|
+import com.zhiqiyun.open.core.spiders.WeiboPipeline;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
-import org.apache.commons.codec.digest.DigestUtils;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
import org.jetbrains.annotations.NotNull;
|
|
|
-import org.jsoup.nodes.Document;
|
|
|
-import org.jsoup.nodes.Element;
|
|
|
-import org.jsoup.select.Elements;
|
|
|
import org.springframework.beans.BeansException;
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
|
import org.springframework.context.ApplicationContext;
|
|
|
import org.springframework.context.ApplicationContextAware;
|
|
|
import org.springframework.stereotype.Service;
|
|
|
-import us.codecraft.webmagic.*;
|
|
|
+import us.codecraft.webmagic.Spider;
|
|
|
import us.codecraft.webmagic.downloader.Downloader;
|
|
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
|
|
import us.codecraft.webmagic.processor.PageProcessor;
|
|
|
-import us.codecraft.webmagic.selector.Selectable;
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
+import java.util.Arrays;
|
|
|
import java.util.List;
|
|
|
-import java.util.Map;
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
-import java.util.regex.Pattern;
|
|
|
|
|
|
@Slf4j
|
|
|
@Service
|
|
|
@@ -60,16 +54,28 @@ public class SentimentSpiderEventServiceImpl extends ServiceImpl<SentimentSpider
|
|
|
}
|
|
|
String[] urlArray = rule.getStartUrls().split("\n");
|
|
|
|
|
|
- Spider spider = Spider.create(new SpiderEventProcessor(rule));
|
|
|
+ PageProcessor processor;
|
|
|
+ Pipeline pipeline;
|
|
|
+ String[] keywords = event.getKeywords().split(",");
|
|
|
+ if ("weibo".equals(rule.getCollectorName())) {
|
|
|
+ processor = new WeiboPageProcessor(rule);
|
|
|
+ pipeline = new WeiboPipeline(keywords, event.getId(), event.getSiteRuleId(), this.sentimentSpiderResultService);
|
|
|
+ } else {
|
|
|
+ processor = new HtmlPageProcessor(rule);
|
|
|
+ pipeline = new SpiderEventPipeline(keywords, event.getId(), event.getSiteRuleId(), this.sentimentSpiderResultService);
|
|
|
+ }
|
|
|
|
|
|
+ Spider spider = Spider.create(processor);
|
|
|
List<Pipeline> pipelines = new ArrayList<>();
|
|
|
|
|
|
- String[] keywords = event.getKeywords().split(",");
|
|
|
- pipelines.add(new SpiderEventPipeline(keywords, event.getId(), event.getSiteRuleId(), this.sentimentSpiderResultService));
|
|
|
+ pipelines.add(pipeline);
|
|
|
spider.setPipelines(pipelines);
|
|
|
spider.setExitWhenComplete(true);
|
|
|
- Downloader downloader = this.applicationContext.getBean(OkHttpDownloader.class);
|
|
|
- spider.setDownloader(downloader);
|
|
|
+ if (!Arrays.asList("html", "weibo").contains(rule.getCollectorName())) {
|
|
|
+ String beanName = String.format("%s.downloader", rule.getCollectorName());
|
|
|
+ Downloader downloader = this.applicationContext.getBean(beanName, Downloader.class);
|
|
|
+ spider.setDownloader(downloader);
|
|
|
+ }
|
|
|
spider.addUrl(urlArray);
|
|
|
spider.runAsync();
|
|
|
|
|
|
@@ -99,183 +105,135 @@ public class SentimentSpiderEventServiceImpl extends ServiceImpl<SentimentSpider
|
|
|
}
|
|
|
|
|
|
|
|
|
- public static class SpiderEventPipeline implements Pipeline {
|
|
|
-
|
|
|
- private final String[] keywords;
|
|
|
- private final Long eventId;
|
|
|
- private final Long siteRuleId;
|
|
|
-
|
|
|
- private final SentimentSpiderResultService sentimentSpiderResultService;
|
|
|
-
|
|
|
- public SpiderEventPipeline(String[] keywords, Long eventId, Long siteRuleId, SentimentSpiderResultService sentimentSpiderResultService) {
|
|
|
- this.sentimentSpiderResultService = sentimentSpiderResultService;
|
|
|
- this.keywords = keywords;
|
|
|
- this.eventId = eventId;
|
|
|
- this.siteRuleId = siteRuleId;
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public void process(ResultItems resultItems, Task task) {
|
|
|
- Map<String, Object> dataMap = resultItems.getAll();
|
|
|
-
|
|
|
- String url = resultItems.getRequest().getUrl();
|
|
|
- String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
|
|
|
-
|
|
|
- String title = dataMap.getOrDefault("title", "").toString();
|
|
|
- String keywords = dataMap.getOrDefault("keywords", "").toString();
|
|
|
- String description = dataMap.getOrDefault("description", "").toString();
|
|
|
- String bodyText = dataMap.getOrDefault("bodyText", "").toString();
|
|
|
- String html = dataMap.getOrDefault("html", "").toString();
|
|
|
- List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
|
|
|
-
|
|
|
-
|
|
|
- boolean flag = true;
|
|
|
- for (String keyword : this.keywords) {
|
|
|
- if (!bodyText.contains(keyword)) {
|
|
|
- flag = false;
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- List<String> fragments = new ArrayList<>();
|
|
|
- for (String s : listFragments) {
|
|
|
- boolean isContains = false;
|
|
|
- for (String keyword : this.keywords) {
|
|
|
- if (s.contains(keyword)) {
|
|
|
- isContains = true;
|
|
|
- s = s.replace(keyword, "<em>" + keyword + "</em>");
|
|
|
- }
|
|
|
- }
|
|
|
- if (isContains) {
|
|
|
- fragments.add(s);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- log.info("{}>>>>>>>>{}>>>>>>>>{}", flag, listFragments.size(), bodyText);
|
|
|
- if (flag) {
|
|
|
- SentimentSpiderResult result = new SentimentSpiderResult();
|
|
|
- result.setId(id);
|
|
|
- result.setEventId(eventId);
|
|
|
- result.setSiteRuleId(siteRuleId);
|
|
|
- result.setUrl(url);
|
|
|
- result.setTitle(title);
|
|
|
- result.setKeywords(keywords);
|
|
|
- result.setDescription(description);
|
|
|
- result.setFragments(fragments);
|
|
|
- result.setBodyText(bodyText);
|
|
|
- result.setHtml(html);
|
|
|
- result.setSpiderTime(DateUtil.current());
|
|
|
- result.setUpdateTime(DateUtil.current());
|
|
|
- this.sentimentSpiderResultService.saveOrUpdate(result);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- public static class SpiderEventProcessor implements PageProcessor {
|
|
|
- private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
|
|
|
-
|
|
|
- private final Site site;
|
|
|
-
|
|
|
- private final SentimentSpiderSiteRule rule;
|
|
|
-
|
|
|
-
|
|
|
- public SpiderEventProcessor(SentimentSpiderSiteRule rule) {
|
|
|
- this.rule = rule;
|
|
|
- this.site = Site.me();
|
|
|
- this.site.setUseGzip(true);
|
|
|
- this.site.setRetryTimes(3);
|
|
|
- this.site.setSleepTime(1000);
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public void process(Page page) {
|
|
|
- List<String> listUrls = page.getHtml().links().all();
|
|
|
-
|
|
|
- listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
|
|
|
-
|
|
|
- String urlPatterns = this.rule.getUrlPatterns();
|
|
|
- if (StringUtils.isNotBlank(urlPatterns)) {
|
|
|
- String[] listPatterns = urlPatterns.split("\n");
|
|
|
- listUrls.removeIf(url -> {
|
|
|
- boolean hasMatched = false;
|
|
|
- for (String regex : listPatterns) {
|
|
|
- hasMatched = Pattern.matches(regex, url);
|
|
|
- if (hasMatched) {
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
- return !hasMatched;
|
|
|
- });
|
|
|
- }
|
|
|
-
|
|
|
- LIST_SPIDER_URLS.addAll(listUrls);
|
|
|
-
|
|
|
- page.addTargetRequests(listUrls);
|
|
|
-
|
|
|
- Document document = page.getHtml().getDocument();
|
|
|
-
|
|
|
- String title = document.title();
|
|
|
- String bodyText = "";
|
|
|
- String html = document.html();
|
|
|
-
|
|
|
- String keywords = "";
|
|
|
- String description = "";
|
|
|
- Elements elements = document.getElementsByTag("meta");
|
|
|
-
|
|
|
- if (elements != null && elements.size() > 0) {
|
|
|
- for (Element element : elements) {
|
|
|
- String metaName = element.attr("name");
|
|
|
- String metaContent = element.attr("content");
|
|
|
- if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
|
|
|
- keywords = metaContent;
|
|
|
- }
|
|
|
- if (StringUtils.equalsIgnoreCase("description", metaName)) {
|
|
|
- description = metaContent;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- Selectable selectable = page.getHtml().xpath(this.rule.getXpath());
|
|
|
-
|
|
|
- List<String> listFragments = new ArrayList<>();
|
|
|
- if (selectable.match()) {
|
|
|
- document.html(selectable.get());
|
|
|
- bodyText = document.text();
|
|
|
-
|
|
|
- Elements allElements = document.children().first().children();
|
|
|
- for (Element element : allElements) {
|
|
|
- String text = element.text();
|
|
|
- if (StringUtils.isNotBlank(text)) {
|
|
|
- listFragments.add(text.trim());
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if (StringUtils.isBlank(bodyText)) {
|
|
|
- page.setSkip(true);
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
|
|
|
- description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
|
|
|
- }
|
|
|
-
|
|
|
- if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
|
|
|
- keywords = description;
|
|
|
- }
|
|
|
-
|
|
|
- page.putField("title", title);
|
|
|
- page.putField("bodyText", bodyText);
|
|
|
- page.putField("html", html);
|
|
|
- page.putField("keywords", keywords);
|
|
|
- page.putField("description", description);
|
|
|
- page.putField("listFragments", JSON.toJSONString(listFragments));
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public Site getSite() {
|
|
|
- return this.site;
|
|
|
- }
|
|
|
- }
|
|
|
+// public static class SpiderEventProcessor implements PageProcessor {
|
|
|
+// private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
|
|
|
+//
|
|
|
+// private final Site site;
|
|
|
+//
|
|
|
+// private final SentimentSpiderSiteRule rule;
|
|
|
+//
|
|
|
+//
|
|
|
+// public SpiderEventProcessor(SentimentSpiderSiteRule rule) {
|
|
|
+// this.rule = rule;
|
|
|
+// this.site = Site.me();
|
|
|
+// this.site.setUseGzip(true);
|
|
|
+// this.site.setRetryTimes(3);
|
|
|
+// this.site.setSleepTime(1000);
|
|
|
+// }
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// public void process(Page page) {
|
|
|
+// List<String> listUrls = page.getHtml().links().all();
|
|
|
+//
|
|
|
+// listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
|
|
|
+//
|
|
|
+// List<UrlPatterns> listUrlPatterns = this.rule.getUrlPatterns();
|
|
|
+//
|
|
|
+// AtomicReference<UrlPatterns> matchedUrlPattern = new AtomicReference<>(null);
|
|
|
+// if (listUrlPatterns != null) {
|
|
|
+// listUrls.removeIf(url -> {
|
|
|
+// log.info(url);
|
|
|
+// boolean hasMatched = false;
|
|
|
+// for (UrlPatterns urlPattern : listUrlPatterns) {
|
|
|
+// hasMatched = Pattern.matches(urlPattern.getUrlPattern(), url);
|
|
|
+// if (hasMatched) {
|
|
|
+// break;
|
|
|
+// }
|
|
|
+// }
|
|
|
+// return !hasMatched;
|
|
|
+// });
|
|
|
+//
|
|
|
+// for (UrlPatterns urlPattern : listUrlPatterns) {
|
|
|
+// if (Pattern.matches(urlPattern.getUrlPattern(), page.getUrl().get())) {
|
|
|
+// matchedUrlPattern.set(urlPattern);
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// LIST_SPIDER_URLS.addAll(listUrls);
|
|
|
+//
|
|
|
+// page.addTargetRequests(listUrls);
|
|
|
+//
|
|
|
+// Document document = page.getHtml().getDocument();
|
|
|
+//
|
|
|
+// String title = document.title();
|
|
|
+// String bodyText = "";
|
|
|
+// String html = document.html();
|
|
|
+//
|
|
|
+// String keywords = "";
|
|
|
+// String description = "";
|
|
|
+// Elements elements = document.getElementsByTag("meta");
|
|
|
+//
|
|
|
+// if (elements != null && elements.size() > 0) {
|
|
|
+// for (Element element : elements) {
|
|
|
+// String metaName = element.attr("name");
|
|
|
+// String metaContent = element.attr("content");
|
|
|
+// if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
|
|
|
+// keywords = metaContent;
|
|
|
+// }
|
|
|
+// if (StringUtils.equalsIgnoreCase("description", metaName)) {
|
|
|
+// description = metaContent;
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// Selectable selectable;
|
|
|
+// UrlPatterns urlPatterns = null;
|
|
|
+// if (matchedUrlPattern.get() != null) {
|
|
|
+// urlPatterns = matchedUrlPattern.get();
|
|
|
+// }
|
|
|
+//
|
|
|
+// if (urlPatterns == null) {
|
|
|
+// selectable = page.getHtml();
|
|
|
+// } else if ("XpathSelector".equals(urlPatterns.getSelectorName())) {
|
|
|
+// XpathSelector xpathSelector = new XpathSelector(urlPatterns.getSelectorText());
|
|
|
+// selectable = page.getHtml().select(xpathSelector);
|
|
|
+// } else if ("CssSelector".equals(urlPatterns.getSelectorName())) {
|
|
|
+// CssSelector cssSelector = new CssSelector(urlPatterns.getSelectorText());
|
|
|
+// selectable = page.getHtml().select(cssSelector);
|
|
|
+// } else {
|
|
|
+// selectable = page.getHtml();
|
|
|
+// }
|
|
|
+//
|
|
|
+// List<String> listFragments = new ArrayList<>();
|
|
|
+// if (selectable.match()) {
|
|
|
+// document.html(selectable.get());
|
|
|
+// bodyText = document.text();
|
|
|
+//
|
|
|
+// Elements allElements = document.children().first().children();
|
|
|
+// for (Element element : allElements) {
|
|
|
+// String text = element.text();
|
|
|
+// if (StringUtils.isNotBlank(text)) {
|
|
|
+// listFragments.add(text.trim());
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// log.info("{}>>>>>>>>{}>>>>>>>>{}", page.getUrl(), listFragments.size(), bodyText);
|
|
|
+//
|
|
|
+// if (StringUtils.isBlank(bodyText)) {
|
|
|
+// page.setSkip(true);
|
|
|
+// }
|
|
|
+//
|
|
|
+//
|
|
|
+// if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
|
|
|
+// description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
|
|
|
+// }
|
|
|
+//
|
|
|
+// if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
|
|
|
+// keywords = description;
|
|
|
+// }
|
|
|
+//
|
|
|
+// page.putField("title", title);
|
|
|
+// page.putField("bodyText", bodyText);
|
|
|
+// page.putField("html", html);
|
|
|
+// page.putField("keywords", keywords);
|
|
|
+// page.putField("description", description);
|
|
|
+// page.putField("listFragments", JSON.toJSONString(listFragments));
|
|
|
+// }
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// public Site getSite() {
|
|
|
+// return this.site;
|
|
|
+// }
|
|
|
+// }
|
|
|
}
|