|
|
@@ -1,44 +1,174 @@
|
|
|
package com.zhiqiyun;
|
|
|
|
|
|
+import com.alibaba.fastjson.JSON;
|
|
|
+import com.zhiqiyun.open.core.models.statistics.PopularFeelingsSiteRule;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
+import org.apache.commons.codec.digest.DigestUtils;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
-import us.codecraft.webmagic.Page;
|
|
|
-import us.codecraft.webmagic.Site;
|
|
|
-import us.codecraft.webmagic.Spider;
|
|
|
+import org.jsoup.nodes.Document;
|
|
|
+import org.jsoup.nodes.Element;
|
|
|
+import org.jsoup.select.Elements;
|
|
|
+import us.codecraft.webmagic.*;
|
|
|
+import us.codecraft.webmagic.pipeline.Pipeline;
|
|
|
import us.codecraft.webmagic.processor.PageProcessor;
|
|
|
+import us.codecraft.webmagic.selector.Selectable;
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
+import java.util.Arrays;
|
|
|
+import java.util.HashSet;
|
|
|
import java.util.List;
|
|
|
+import java.util.Map;
|
|
|
+import java.util.Set;
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
|
|
@Slf4j
|
|
|
-public class SimplePageProcessor implements PageProcessor {
|
|
|
- private final Site site = Site.me().setDomain("news.qq.com").setRetryTimes(3).setSleepTime(1000);
|
|
|
+public class SimplePageProcessor {
|
|
|
+ private final Site site = Site.me().setDomain("hongdou.gxnews.com.cn").setRetryTimes(3).setSleepTime(1000);
|
|
|
|
|
|
- private static final List<String> LIST_SPIDER_URLS = new ArrayList<>();
|
|
|
+    /**
+     * Manual smoke run: crawls {@code https://www.cqn.com.cn/} with a
+     * {@link PopularFeelingsProcessor} and logs the text fragments extracted from each page.
+     *
+     * <p>The spider is started with {@code runAsync()} and configured to exit when the
+     * crawl completes.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        PopularFeelingsSiteRule rule = new PopularFeelingsSiteRule();
+        rule.setStartUrls(Arrays.asList("https://www.cqn.com.cn/"));
+        rule.setUrlPatterns("https://www.cqn.com.cn/([a-zA-Z]+)/content/([0-9]+)-([0-9]+)/([0-9]+)/content_([0-9]+).htm");
+        rule.setXpath("/html/body/div[4]/div[1]/div[3]/div[1]");

-    public SimplePageProcessor() {
-    }
+        Spider spider = Spider.create(new SimplePageProcessor.PopularFeelingsProcessor(rule));

-    public void process(Page page) {
-        List<String> listUrls = page.getHtml().links().all();
-        listUrls.removeIf(s -> {
-            return LIST_SPIDER_URLS.contains(s) || StringUtils.isBlank(s);
+        List<Pipeline> pipelines = new ArrayList<>();
+        pipelines.add((resultItems, task) -> {
+            // The processor stores "listFragments" as a JSON array string; fall back to an
+            // empty array when the field is missing.
+            Object rawFragments = resultItems.getAll().getOrDefault("listFragments", "[]");
+            List<String> listFragments = JSON.parseArray(rawFragments.toString(), String.class);
+            log.info(JSON.toJSONString(listFragments));
+            // TODO: keyword matching over the fragments was never implemented here (the
+            // previous draft had an empty loop plus commented-out code using the placeholder
+            // keywords "旅游投诉", "违法", "柳城"). Add it once the matching rule is decided.
        });
-        LIST_SPIDER_URLS.addAll(listUrls);
-        log.info(">>>>>>>>{}", LIST_SPIDER_URLS.size());
-        page.addTargetRequests(listUrls);
+        spider.setPipelines(pipelines);
+        spider.setExitWhenComplete(true);
+        spider.addUrl(rule.getStartUrls().toArray(new String[0]));
+        spider.runAsync();
+    }
|
|
|
+
|
|
|
|
|
|
- page.putField("title", page.getHtml().getDocument().title());
|
|
|
- if (page.getResultItems().get("title") == null) {
|
|
|
- page.setSkip(true);
|
|
|
+    /**
+     * Page processor configured by a {@link PopularFeelingsSiteRule}.
+     *
+     * <p>For every page it queues the not-yet-seen links that match the rule's URL patterns
+     * and stores these result fields: {@code title}, {@code bodyText}, {@code html},
+     * {@code keywords}, {@code description} and {@code listFragments} (a JSON array string
+     * holding the text of each child element of the node selected by the rule's XPath).
+     *
+     * <p>NOTE(review): the visited-URL set is a plain {@link HashSet} with no
+     * synchronization; confirm the spider runs single-threaded before raising its thread count.
+     */
+    static class PopularFeelingsProcessor implements PageProcessor {
+        /** URLs already handed to the spider as target requests. */
+        private final Set<String> seenUrls = new HashSet<>();
+
+        private final Site site;
+
+        private final PopularFeelingsSiteRule rule;
+
+        /**
+         * Creates a processor for the given rule, with gzip enabled, 3 retries and a
+         * 1000 ms sleep time on the site settings.
+         *
+         * @param rule supplies the URL patterns and the content XPath; read again on every page
+         */
+        public PopularFeelingsProcessor(PopularFeelingsSiteRule rule) {
+            this.rule = rule;
+            this.site = Site.me();
+            this.site.setUseGzip(true);
+            this.site.setRetryTimes(3);
+            this.site.setSleepTime(1000);
        }
-    }

-    public Site getSite() {
-        return this.site;
-    }
+        /**
+         * Queues follow-up links and extracts the page's metadata and text fragments.
+         *
+         * @param page the downloaded page
+         */
+        @Override
+        public void process(Page page) {
+            page.addTargetRequests(collectNewTargets(page));

-    public static void main(String[] args) {
-        Spider.create(new SimplePageProcessor()).addUrl("https://news.qq.com/").run();
+            Document document = page.getHtml().getDocument();
+            String title = document.title();
+            String bodyText = document.text();
+            String html = document.html();
+
+            String keywords = metaContent(document, "keywords");
+            String description = metaContent(document, "description");
+            // No description meta tag: fall back to the first 200 characters of the body text.
+            if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
+                description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
+            }
+            // No keywords meta tag: reuse the description.
+            if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
+                keywords = description;
+            }
+
+            // Must run after title/bodyText/html were captured: fragment extraction
+            // overwrites the content of the page's Document.
+            List<String> listFragments = extractFragments(page, document);
+
+            page.putField("title", title);
+            page.putField("bodyText", bodyText);
+            page.putField("html", html);
+            page.putField("keywords", keywords);
+            page.putField("description", description);
+            page.putField("listFragments", JSON.toJSONString(listFragments));
+        }
+
+        /**
+         * Returns the page's links that are non-blank, match the rule's URL patterns and
+         * have not been returned before, in page order, and records them as seen.
+         */
+        private List<String> collectNewTargets(Page page) {
+            // Compile once per page instead of once per link and pattern.
+            List<Pattern> patterns = compileUrlPatterns(this.rule.getUrlPatterns());
+            List<String> targets = new ArrayList<>();
+            for (String url : page.getHtml().links().all()) {
+                if (StringUtils.isBlank(url) || !matchesAny(patterns, url)) {
+                    continue;
+                }
+                // add() is false for URLs seen on earlier pages or earlier on this page.
+                if (this.seenUrls.add(url)) {
+                    targets.add(url);
+                }
+            }
+            return targets;
+        }
+
+        /**
+         * Compiles the newline-separated regex list; blank input yields an empty list.
+         * Lines are trimmed so CRLF-separated input does not leave a stray {@code \r} in
+         * the regex, and empty lines are skipped.
+         */
+        private static List<Pattern> compileUrlPatterns(String urlPatterns) {
+            List<Pattern> compiled = new ArrayList<>();
+            if (StringUtils.isBlank(urlPatterns)) {
+                return compiled;
+            }
+            for (String line : urlPatterns.split("\n")) {
+                String regex = line.trim();
+                if (!regex.isEmpty()) {
+                    compiled.add(Pattern.compile(regex));
+                }
+            }
+            return compiled;
+        }
+
+        /**
+         * Returns true when no patterns are configured, or when the whole URL matches at
+         * least one of them.
+         */
+        private static boolean matchesAny(List<Pattern> patterns, String url) {
+            if (patterns.isEmpty()) {
+                return true;
+            }
+            for (Pattern pattern : patterns) {
+                if (pattern.matcher(url).matches()) {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /**
+         * Returns the content of the last {@code <meta>} tag whose name equals {@code name}
+         * ignoring case, or an empty string when there is none.
+         */
+        private static String metaContent(Document document, String name) {
+            String content = "";
+            for (Element meta : document.getElementsByTag("meta")) {
+                if (StringUtils.equalsIgnoreCase(name, meta.attr("name"))) {
+                    content = meta.attr("content");
+                }
+            }
+            return content;
+        }
+
+        /**
+         * Returns the trimmed, non-blank text of each child element of the node selected
+         * by the rule's XPath. Returns an empty list when the XPath is blank, selects
+         * nothing, or selects content without any element.
+         *
+         * <p>Side effect: replaces the content of {@code document} with the selected HTML.
+         */
+        private List<String> extractFragments(Page page, Document document) {
+            List<String> fragments = new ArrayList<>();
+            String xpath = this.rule.getXpath();
+            if (StringUtils.isBlank(xpath)) {
+                return fragments;
+            }
+            String selectedHtml = page.getHtml().xpath(xpath).get();
+            if (StringUtils.isBlank(selectedHtml)) {
+                return fragments;
+            }
+
+            document.html(selectedHtml);
+            // first() is null when the selected content holds no element (e.g. plain text).
+            Element root = document.children().first();
+            if (root == null) {
+                return fragments;
+            }
+            for (Element child : root.children()) {
+                String text = child.text();
+                if (StringUtils.isNotBlank(text)) {
+                    fragments.add(text.trim());
+                }
+            }
+            return fragments;
+        }
+
+        /**
+         * @return the site settings created in the constructor
+         */
+        @Override
+        public Site getSite() {
+            return this.site;
+        }
    }
|
|
|
}
|