|
|
@@ -21,154 +21,164 @@ import java.util.regex.Pattern;
|
|
|
|
|
|
@Slf4j
|
|
|
public class SimplePageProcessor {
|
|
|
- private final Site site = Site.me().setDomain("hongdou.gxnews.com.cn").setRetryTimes(3).setSleepTime(1000);
|
|
|
-
|
|
|
- public static void main(String[] args) {
|
|
|
- PopularFeelingsSiteRule rule = new PopularFeelingsSiteRule();
|
|
|
- rule.setStartUrls(Arrays.asList("https://www.cqn.com.cn/"));
|
|
|
- rule.setUrlPatterns("https://www.cqn.com.cn/([a-zA-Z]+)/content/([0-9]+)-([0-9]+)/([0-9]+)/content_([0-9]+).htm");
|
|
|
- rule.setXpath("/html/body/div[4]/div[1]/div[3]/div[1]");
|
|
|
-
|
|
|
- Spider spider = Spider.create(new SimplePageProcessor.PopularFeelingsProcessor(rule));
|
|
|
-
|
|
|
- List<Pipeline> pipelines = new ArrayList<>();
|
|
|
- pipelines.add(new Pipeline() {
|
|
|
- @Override
|
|
|
- public void process(ResultItems resultItems, Task task) {
|
|
|
- Map<String, Object> dataMap = resultItems.getAll();
|
|
|
-
|
|
|
- String url = resultItems.getRequest().getUrl();
|
|
|
- String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
|
|
|
-
|
|
|
- String title = dataMap.getOrDefault("title", "").toString();
|
|
|
- String keywords = dataMap.getOrDefault("keywords", "").toString();
|
|
|
- String description = dataMap.getOrDefault("description", "").toString();
|
|
|
- String bodyText = dataMap.getOrDefault("bodyText", "").toString();
|
|
|
- String html = dataMap.getOrDefault("html", "").toString();
|
|
|
- List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
|
|
|
-
|
|
|
- String[] listKeywords = new String[]{"旅游投诉", "违法", "柳城"};
|
|
|
- log.info(JSON.toJSONString(listFragments));
|
|
|
-
|
|
|
- for (String s : listFragments) {
|
|
|
-
|
|
|
- }
|
|
|
-// boolean flag = true;
|
|
|
-// for (String k : listKeywords) {
|
|
|
-// if (!bodyText.contains(k)) {
|
|
|
-// flag = false;
|
|
|
-// break;
|
|
|
-// }
|
|
|
-// }
|
|
|
- }
|
|
|
- });
|
|
|
- spider.setPipelines(pipelines);
|
|
|
- spider.setExitWhenComplete(true);
|
|
|
- spider.addUrl(rule.getStartUrls().toArray(new String[]{}));
|
|
|
- spider.runAsync();
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- static class PopularFeelingsProcessor implements PageProcessor {
|
|
|
- private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
|
|
|
-
|
|
|
- private final Site site;
|
|
|
-
|
|
|
- private final PopularFeelingsSiteRule rule;
|
|
|
-
|
|
|
-
|
|
|
- public PopularFeelingsProcessor(PopularFeelingsSiteRule rule) {
|
|
|
- this.rule = rule;
|
|
|
- this.site = Site.me();
|
|
|
- this.site.setUseGzip(true);
|
|
|
- this.site.setRetryTimes(3);
|
|
|
- this.site.setSleepTime(1000);
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public void process(Page page) {
|
|
|
- List<String> listUrls = page.getHtml().links().all();
|
|
|
- listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
|
|
|
-
|
|
|
- String urlPatterns = this.rule.getUrlPatterns();
|
|
|
- if (StringUtils.isNotBlank(urlPatterns)) {
|
|
|
- String[] listPatterns = urlPatterns.split("\n");
|
|
|
- listUrls.removeIf(url -> {
|
|
|
- boolean hasMatched = false;
|
|
|
- for (String regex : listPatterns) {
|
|
|
- hasMatched = Pattern.matches(regex, url);
|
|
|
- if (hasMatched) {
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
- return !hasMatched;
|
|
|
- });
|
|
|
- }
|
|
|
-
|
|
|
- LIST_SPIDER_URLS.addAll(listUrls);
|
|
|
-
|
|
|
- page.addTargetRequests(listUrls);
|
|
|
-
|
|
|
- Document document = page.getHtml().getDocument();
|
|
|
-
|
|
|
- String title = document.title();
|
|
|
- String bodyText = document.text();
|
|
|
- String html = document.html();
|
|
|
-
|
|
|
- String keywords = "";
|
|
|
- String description = "";
|
|
|
- Elements elements = document.getElementsByTag("meta");
|
|
|
-
|
|
|
- if (elements != null && elements.size() > 0) {
|
|
|
- for (Element element : elements) {
|
|
|
- String metaName = element.attr("name");
|
|
|
- String metaContent = element.attr("content");
|
|
|
- if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
|
|
|
- keywords = metaContent;
|
|
|
- }
|
|
|
- if (StringUtils.equalsIgnoreCase("description", metaName)) {
|
|
|
- description = metaContent;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
|
|
|
- description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
|
|
|
- }
|
|
|
-
|
|
|
- if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
|
|
|
- keywords = description;
|
|
|
- }
|
|
|
-
|
|
|
- Selectable selectable = page.getHtml().xpath(this.rule.getXpath());
|
|
|
- String selectableHtml = selectable.get();
|
|
|
-
|
|
|
- List<String> listFragments = new ArrayList<>();
|
|
|
-
|
|
|
- if (StringUtils.isNotBlank(selectableHtml)) {
|
|
|
- document.html(selectableHtml);
|
|
|
-
|
|
|
- Elements allElements = document.children().first().children();
|
|
|
- for (Element element : allElements) {
|
|
|
- String text = element.text();
|
|
|
- if (StringUtils.isNotBlank(text)) {
|
|
|
- listFragments.add(text.trim());
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- page.putField("title", title);
|
|
|
- page.putField("bodyText", bodyText);
|
|
|
- page.putField("html", html);
|
|
|
- page.putField("keywords", keywords);
|
|
|
- page.putField("description", description);
|
|
|
- page.putField("listFragments", JSON.toJSONString(listFragments));
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public Site getSite() {
|
|
|
- return this.site;
|
|
|
- }
|
|
|
- }
|
|
|
+ public static void main(String[] args) {
|
|
|
+ PopularFeelingsSiteRule rule = new PopularFeelingsSiteRule();
|
|
|
+ rule.setStartUrls(Arrays.asList("https://www.cqn.com.cn/"));
|
|
|
+ rule.setUrlPatterns("https://www.cqn.com.cn/([a-zA-Z]+)/content/([0-9]+)-([0-9]+)/([0-9]+)/content_([0-9]+).htm");
|
|
|
+ rule.setXpath("/html/body/div[4]/div[1]/div[3]/div[1]");
|
|
|
+
|
|
|
+ Spider spider = Spider.create(new SimplePageProcessor.PopularFeelingsProcessor(rule));
|
|
|
+
|
|
|
+ List<Pipeline> pipelines = new ArrayList<>();
|
|
|
+ pipelines.add(new Pipeline() {
|
|
|
+ @Override
|
|
|
+ public void process(ResultItems resultItems, Task task) {
|
|
|
+ Map<String, Object> dataMap = resultItems.getAll();
|
|
|
+
|
|
|
+ String url = resultItems.getRequest().getUrl();
|
|
|
+ String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
|
|
|
+
|
|
|
+ String title = dataMap.getOrDefault("title", "").toString();
|
|
|
+ String keywords = dataMap.getOrDefault("keywords", "").toString();
|
|
|
+ String description = dataMap.getOrDefault("description", "").toString();
|
|
|
+ String bodyText = dataMap.getOrDefault("bodyText", "").toString();
|
|
|
+ String html = dataMap.getOrDefault("html", "").toString();
|
|
|
+ List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
|
|
|
+
|
|
|
+ String[] listKeywords = new String[]{"十三届五次会", "习近平"};
|
|
|
+
|
|
|
+ boolean flag = true;
|
|
|
+ for (String keyword : listKeywords) {
|
|
|
+ if (!bodyText.contains(keyword)) {
|
|
|
+ flag = false;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ listFragments.removeIf(o -> {
|
|
|
+ boolean hasContains = false;
|
|
|
+ for (String keyword : listKeywords) {
|
|
|
+ if (o.contains(keyword)) {
|
|
|
+ hasContains = true;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return !hasContains;
|
|
|
+ });
|
|
|
+
|
|
|
+ log.info("{}>>>>>>>>{}>>>>>>>>{}", flag, listFragments.size(), bodyText);
|
|
|
+ }
|
|
|
+ });
|
|
|
+ spider.setPipelines(pipelines);
|
|
|
+ spider.setExitWhenComplete(true);
|
|
|
+ spider.addUrl(rule.getStartUrls().toArray(new String[]{}));
|
|
|
+ spider.runAsync();
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ static class PopularFeelingsProcessor implements PageProcessor {
|
|
|
+ private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
|
|
|
+
|
|
|
+ private final Site site;
|
|
|
+
|
|
|
+ private final PopularFeelingsSiteRule rule;
|
|
|
+
|
|
|
+
|
|
|
+ public PopularFeelingsProcessor(PopularFeelingsSiteRule rule) {
|
|
|
+ this.rule = rule;
|
|
|
+ this.site = Site.me();
|
|
|
+ this.site.setUseGzip(true);
|
|
|
+ this.site.setRetryTimes(3);
|
|
|
+ this.site.setSleepTime(1000);
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void process(Page page) {
|
|
|
+ List<String> listUrls = page.getHtml().links().all();
|
|
|
+ listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
|
|
|
+
|
|
|
+ String urlPatterns = this.rule.getUrlPatterns();
|
|
|
+ if (StringUtils.isNotBlank(urlPatterns)) {
|
|
|
+ String[] listPatterns = urlPatterns.split("\n");
|
|
|
+ listUrls.removeIf(url -> {
|
|
|
+ boolean hasMatched = false;
|
|
|
+ for (String regex : listPatterns) {
|
|
|
+ hasMatched = Pattern.matches(regex, url);
|
|
|
+ if (hasMatched) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return !hasMatched;
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
+ LIST_SPIDER_URLS.addAll(listUrls);
|
|
|
+
|
|
|
+ page.addTargetRequests(listUrls);
|
|
|
+
|
|
|
+ Document document = page.getHtml().getDocument();
|
|
|
+
|
|
|
+ String title = document.title();
|
|
|
+ String bodyText = "";
|
|
|
+ String html = document.html();
|
|
|
+
|
|
|
+ String keywords = "";
|
|
|
+ String description = "";
|
|
|
+ Elements elements = document.getElementsByTag("meta");
|
|
|
+
|
|
|
+ if (elements != null && elements.size() > 0) {
|
|
|
+ for (Element element : elements) {
|
|
|
+ String metaName = element.attr("name");
|
|
|
+ String metaContent = element.attr("content");
|
|
|
+ if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
|
|
|
+ keywords = metaContent;
|
|
|
+ }
|
|
|
+ if (StringUtils.equalsIgnoreCase("description", metaName)) {
|
|
|
+ description = metaContent;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Selectable selectable = page.getHtml().xpath(this.rule.getXpath());
|
|
|
+
|
|
|
+ List<String> listFragments = new ArrayList<>();
|
|
|
+ if (selectable.match()) {
|
|
|
+ document.html(selectable.get());
|
|
|
+ bodyText = document.text();
|
|
|
+
|
|
|
+ Elements allElements = document.children().first().children();
|
|
|
+ for (Element element : allElements) {
|
|
|
+ String text = element.text();
|
|
|
+ if (StringUtils.isNotBlank(text)) {
|
|
|
+ listFragments.add(text.trim());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (StringUtils.isBlank(bodyText)) {
|
|
|
+ page.setSkip(true);
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
|
|
|
+ description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
|
|
|
+ keywords = description;
|
|
|
+ }
|
|
|
+
|
|
|
+ page.putField("title", title);
|
|
|
+ page.putField("bodyText", bodyText);
|
|
|
+ page.putField("html", html);
|
|
|
+ page.putField("keywords", keywords);
|
|
|
+ page.putField("description", description);
|
|
|
+ page.putField("listFragments", JSON.toJSONString(listFragments));
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public Site getSite() {
|
|
|
+ return this.site;
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|