jtoms 4 éve
szülő
commit
40eafaa246
18 módosított fájl, 728 hozzáadás és 477 törlés
  1. 1 1
      src/main/java/com/zhiqiyun/open/core/models/equipment/EquipmentPassenger.java
  2. 1 1
      src/main/java/com/zhiqiyun/open/core/models/place/PlaceBaseInfo.java
  3. 1 1
      src/main/java/com/zhiqiyun/open/core/models/sentiment/SentimentSpiderResult.java
  4. 13 12
      src/main/java/com/zhiqiyun/open/core/models/sentiment/SentimentSpiderSiteRule.java
  5. 13 0
      src/main/java/com/zhiqiyun/open/core/models/sentiment/UrlPatterns.java
  6. 1 1
      src/main/java/com/zhiqiyun/open/core/models/statistics/ComplaintInfo.java
  7. 154 196
      src/main/java/com/zhiqiyun/open/core/service/impl/SentimentSpiderEventServiceImpl.java
  8. 154 0
      src/main/java/com/zhiqiyun/open/core/spiders/HtmlPageProcessor.java
  9. 0 153
      src/main/java/com/zhiqiyun/open/core/spiders/OkHttpDownloader.java
  10. 87 0
      src/main/java/com/zhiqiyun/open/core/spiders/SpiderEventPipeline.java
  11. 103 0
      src/main/java/com/zhiqiyun/open/core/spiders/WeiboPageProcessor.java
  12. 100 0
      src/main/java/com/zhiqiyun/open/core/spiders/WeiboPipeline.java
  13. 0 46
      src/main/java/com/zhiqiyun/open/core/typeHandler/FastjsonTypeHandler.java
  14. 19 0
      src/main/java/com/zhiqiyun/open/core/typeHandler/UrlPatternTypeHandler.java
  15. 9 5
      src/main/java/com/zhiqiyun/open/mvc/params/sentiment/SaveSpiderSiteRuleParam.java
  16. 2 0
      src/main/resources/application-prod.properties
  17. 21 0
      src/main/resources/db/migration/V1.1.3__update_sentiment_spider.sql
  18. 49 61
      src/test/java/com/zhiqiyun/SimplePageProcessor.java

+ 1 - 1
src/main/java/com/zhiqiyun/open/core/models/equipment/EquipmentPassenger.java

@@ -1,8 +1,8 @@
 package com.zhiqiyun.open.core.models.equipment;
 
 import com.baomidou.mybatisplus.annotation.*;
+import com.baomidou.mybatisplus.extension.handlers.FastjsonTypeHandler;
 import com.zhiqiyun.open.core.models.place.PlaceCategory;
-import com.zhiqiyun.open.core.typeHandler.FastjsonTypeHandler;
 import lombok.Data;
 import org.apache.commons.lang3.StringUtils;
 

+ 1 - 1
src/main/java/com/zhiqiyun/open/core/models/place/PlaceBaseInfo.java

@@ -1,7 +1,7 @@
 package com.zhiqiyun.open.core.models.place;
 
 import com.baomidou.mybatisplus.annotation.*;
-import com.zhiqiyun.open.core.typeHandler.FastjsonTypeHandler;
+import com.baomidou.mybatisplus.extension.handlers.FastjsonTypeHandler;
 import lombok.Data;
 import org.apache.commons.lang3.StringUtils;
 

+ 1 - 1
src/main/java/com/zhiqiyun/open/core/models/sentiment/SentimentSpiderResult.java

@@ -3,7 +3,7 @@ package com.zhiqiyun.open.core.models.sentiment;
 import com.baomidou.mybatisplus.annotation.FieldStrategy;
 import com.baomidou.mybatisplus.annotation.TableField;
 import com.baomidou.mybatisplus.annotation.TableName;
-import com.zhiqiyun.open.core.typeHandler.FastjsonTypeHandler;
+import com.baomidou.mybatisplus.extension.handlers.FastjsonTypeHandler;
 import lombok.Data;
 
 import java.util.Date;

+ 13 - 12
src/main/java/com/zhiqiyun/open/core/models/sentiment/SentimentSpiderSiteRule.java

@@ -3,7 +3,7 @@ package com.zhiqiyun.open.core.models.sentiment;
 import com.baomidou.mybatisplus.annotation.FieldStrategy;
 import com.baomidou.mybatisplus.annotation.TableField;
 import com.baomidou.mybatisplus.annotation.TableName;
-import com.zhiqiyun.open.core.typeHandler.FastjsonTypeHandler;
+import com.zhiqiyun.open.core.typeHandler.UrlPatternTypeHandler;
 import lombok.Data;
 
 import java.util.Date;
@@ -12,17 +12,18 @@ import java.util.List;
 @Data
 @TableName(value = "sentiment_spider_site_rule", autoResultMap = true)
 public class SentimentSpiderSiteRule {
-	private Long id;
-	private String siteName;
-	private String urlPatterns;
-	private String startUrls;
-	private String xpath;
+    private Long id;
+    private String siteName;
+    private String collectorName; // new field; SentimentSpiderEventServiceImpl compares it with weibo/html to pick the processor, pipeline and downloader bean
+    @TableField(typeHandler = UrlPatternTypeHandler.class) // column is converted through UrlPatternTypeHandler instead of being a plain String
+    private List<UrlPatterns> urlPatterns; // replaces the old newline-separated String plus the separate xpath field
+    private String startUrls; // split on newline by SentimentSpiderEventServiceImpl before being added to the spider
 
-	@TableField(updateStrategy = FieldStrategy.NEVER)
-	private Date createdTime;
-	@TableField(updateStrategy = FieldStrategy.NEVER)
-	private Long createdBy;
+    @TableField(updateStrategy = FieldStrategy.NEVER) // FieldStrategy.NEVER: presumably keeps the column out of generated UPDATE statements — confirm in MyBatis-Plus docs
+    private Date createdTime;
+    @TableField(updateStrategy = FieldStrategy.NEVER) // same update strategy as createdTime
+    private Long createdBy;
 
-	private Date updatedTime;
-	private Long updatedBy;
+    private Date updatedTime;
+    private Long updatedBy;
 }

+ 13 - 0
src/main/java/com/zhiqiyun/open/core/models/sentiment/UrlPatterns.java

@@ -0,0 +1,13 @@
+package com.zhiqiyun.open.core.models.sentiment;
+
+import lombok.Data;
+
+import java.io.Serializable;
+
+@Data
+public class UrlPatterns implements Serializable { // one entry of SentimentSpiderSiteRule.urlPatterns: a URL regex plus the selector to apply to matching pages
+    private static final long serialVersionUID = 5307896585465114135L;
+    private String urlPattern; // regex; HtmlPageProcessor tests it against whole URLs with Pattern.matches
+    private String selectorName; // HtmlPageProcessor recognises XpathSelector and CssSelector; any other value makes it use the whole page
+    private String selectorText; // expression handed to the XpathSelector or CssSelector constructor
+}

+ 1 - 1
src/main/java/com/zhiqiyun/open/core/models/statistics/ComplaintInfo.java

@@ -2,8 +2,8 @@ package com.zhiqiyun.open.core.models.statistics;
 
 import com.baomidou.mybatisplus.annotation.TableField;
 import com.baomidou.mybatisplus.annotation.TableName;
+import com.baomidou.mybatisplus.extension.handlers.FastjsonTypeHandler;
 import com.zhiqiyun.open.core.enmus.ComplaintState;
-import com.zhiqiyun.open.core.typeHandler.FastjsonTypeHandler;
 import lombok.Data;
 
 import java.util.Date;

+ 154 - 196
src/main/java/com/zhiqiyun/open/core/service/impl/SentimentSpiderEventServiceImpl.java

@@ -1,38 +1,32 @@
 package com.zhiqiyun.open.core.service.impl;
 
-import com.alibaba.fastjson.JSON;
 import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
 import com.zhiqiyun.open.core.mapper.sentiment.SentimentSpiderEventMapper;
 import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderEvent;
-import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderResult;
 import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderSiteRule;
 import com.zhiqiyun.open.core.service.SentimentSpiderEventService;
 import com.zhiqiyun.open.core.service.SentimentSpiderResultService;
-import com.zhiqiyun.open.core.spiders.OkHttpDownloader;
-import com.zhiqiyun.open.utils.DateUtil;
+import com.zhiqiyun.open.core.spiders.HtmlPageProcessor;
+import com.zhiqiyun.open.core.spiders.SpiderEventPipeline;
+import com.zhiqiyun.open.core.spiders.WeiboPageProcessor;
+import com.zhiqiyun.open.core.spiders.WeiboPipeline;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
 import org.springframework.beans.BeansException;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.context.ApplicationContext;
 import org.springframework.context.ApplicationContextAware;
 import org.springframework.stereotype.Service;
-import us.codecraft.webmagic.*;
+import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.downloader.Downloader;
 import us.codecraft.webmagic.pipeline.Pipeline;
 import us.codecraft.webmagic.processor.PageProcessor;
-import us.codecraft.webmagic.selector.Selectable;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
-import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.regex.Pattern;
 
 @Slf4j
 @Service
@@ -60,16 +54,28 @@ public class SentimentSpiderEventServiceImpl extends ServiceImpl<SentimentSpider
         }
         String[] urlArray = rule.getStartUrls().split("\n");
 
-        Spider spider = Spider.create(new SpiderEventProcessor(rule));
+        PageProcessor processor;
+        Pipeline pipeline;
+        String[] keywords = event.getKeywords().split(",");
+        if ("weibo".equals(rule.getCollectorName())) {
+            processor = new WeiboPageProcessor(rule);
+            pipeline = new WeiboPipeline(keywords, event.getId(), event.getSiteRuleId(), this.sentimentSpiderResultService);
+        } else {
+            processor = new HtmlPageProcessor(rule);
+            pipeline = new SpiderEventPipeline(keywords, event.getId(), event.getSiteRuleId(), this.sentimentSpiderResultService);
+        }
 
+        Spider spider = Spider.create(processor);
         List<Pipeline> pipelines = new ArrayList<>();
 
-        String[] keywords = event.getKeywords().split(",");
-        pipelines.add(new SpiderEventPipeline(keywords, event.getId(), event.getSiteRuleId(), this.sentimentSpiderResultService));
+        pipelines.add(pipeline);
         spider.setPipelines(pipelines);
         spider.setExitWhenComplete(true);
-        Downloader downloader = this.applicationContext.getBean(OkHttpDownloader.class);
-        spider.setDownloader(downloader);
+        if (!Arrays.asList("html", "weibo").contains(rule.getCollectorName())) {
+            String beanName = String.format("%s.downloader", rule.getCollectorName());
+            Downloader downloader = this.applicationContext.getBean(beanName, Downloader.class);
+            spider.setDownloader(downloader);
+        }
         spider.addUrl(urlArray);
         spider.runAsync();
 
@@ -99,183 +105,135 @@ public class SentimentSpiderEventServiceImpl extends ServiceImpl<SentimentSpider
     }
 
 
-    public static class SpiderEventPipeline implements Pipeline {
-
-        private final String[] keywords;
-        private final Long eventId;
-        private final Long siteRuleId;
-
-        private final SentimentSpiderResultService sentimentSpiderResultService;
-
-        public SpiderEventPipeline(String[] keywords, Long eventId, Long siteRuleId, SentimentSpiderResultService sentimentSpiderResultService) {
-            this.sentimentSpiderResultService = sentimentSpiderResultService;
-            this.keywords = keywords;
-            this.eventId = eventId;
-            this.siteRuleId = siteRuleId;
-        }
-
-        @Override
-        public void process(ResultItems resultItems, Task task) {
-            Map<String, Object> dataMap = resultItems.getAll();
-
-            String url = resultItems.getRequest().getUrl();
-            String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
-
-            String title = dataMap.getOrDefault("title", "").toString();
-            String keywords = dataMap.getOrDefault("keywords", "").toString();
-            String description = dataMap.getOrDefault("description", "").toString();
-            String bodyText = dataMap.getOrDefault("bodyText", "").toString();
-            String html = dataMap.getOrDefault("html", "").toString();
-            List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
-
-
-            boolean flag = true;
-            for (String keyword : this.keywords) {
-                if (!bodyText.contains(keyword)) {
-                    flag = false;
-                    break;
-                }
-            }
-
-
-            List<String> fragments = new ArrayList<>();
-            for (String s : listFragments) {
-                boolean isContains = false;
-                for (String keyword : this.keywords) {
-                    if (s.contains(keyword)) {
-                        isContains = true;
-                        s = s.replace(keyword, "<em>" + keyword + "</em>");
-                    }
-                }
-                if (isContains) {
-                    fragments.add(s);
-                }
-            }
-
-            log.info("{}>>>>>>>>{}>>>>>>>>{}", flag, listFragments.size(), bodyText);
-            if (flag) {
-                SentimentSpiderResult result = new SentimentSpiderResult();
-                result.setId(id);
-                result.setEventId(eventId);
-                result.setSiteRuleId(siteRuleId);
-                result.setUrl(url);
-                result.setTitle(title);
-                result.setKeywords(keywords);
-                result.setDescription(description);
-                result.setFragments(fragments);
-                result.setBodyText(bodyText);
-                result.setHtml(html);
-                result.setSpiderTime(DateUtil.current());
-                result.setUpdateTime(DateUtil.current());
-                this.sentimentSpiderResultService.saveOrUpdate(result);
-            }
-        }
-    }
-
-    public static class SpiderEventProcessor implements PageProcessor {
-        private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
-
-        private final Site site;
-
-        private final SentimentSpiderSiteRule rule;
-
-
-        public SpiderEventProcessor(SentimentSpiderSiteRule rule) {
-            this.rule = rule;
-            this.site = Site.me();
-            this.site.setUseGzip(true);
-            this.site.setRetryTimes(3);
-            this.site.setSleepTime(1000);
-        }
-
-        @Override
-        public void process(Page page) {
-            List<String> listUrls = page.getHtml().links().all();
-
-            listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
-
-            String urlPatterns = this.rule.getUrlPatterns();
-            if (StringUtils.isNotBlank(urlPatterns)) {
-                String[] listPatterns = urlPatterns.split("\n");
-                listUrls.removeIf(url -> {
-                    boolean hasMatched = false;
-                    for (String regex : listPatterns) {
-                        hasMatched = Pattern.matches(regex, url);
-                        if (hasMatched) {
-                            break;
-                        }
-                    }
-                    return !hasMatched;
-                });
-            }
-
-            LIST_SPIDER_URLS.addAll(listUrls);
-
-            page.addTargetRequests(listUrls);
-
-            Document document = page.getHtml().getDocument();
-
-            String title = document.title();
-            String bodyText = "";
-            String html = document.html();
-
-            String keywords = "";
-            String description = "";
-            Elements elements = document.getElementsByTag("meta");
-
-            if (elements != null && elements.size() > 0) {
-                for (Element element : elements) {
-                    String metaName = element.attr("name");
-                    String metaContent = element.attr("content");
-                    if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
-                        keywords = metaContent;
-                    }
-                    if (StringUtils.equalsIgnoreCase("description", metaName)) {
-                        description = metaContent;
-                    }
-                }
-            }
-
-            Selectable selectable = page.getHtml().xpath(this.rule.getXpath());
-
-            List<String> listFragments = new ArrayList<>();
-            if (selectable.match()) {
-                document.html(selectable.get());
-                bodyText = document.text();
-
-                Elements allElements = document.children().first().children();
-                for (Element element : allElements) {
-                    String text = element.text();
-                    if (StringUtils.isNotBlank(text)) {
-                        listFragments.add(text.trim());
-                    }
-                }
-            }
-
-            if (StringUtils.isBlank(bodyText)) {
-                page.setSkip(true);
-            }
-
-
-            if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
-                description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
-            }
-
-            if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
-                keywords = description;
-            }
-
-            page.putField("title", title);
-            page.putField("bodyText", bodyText);
-            page.putField("html", html);
-            page.putField("keywords", keywords);
-            page.putField("description", description);
-            page.putField("listFragments", JSON.toJSONString(listFragments));
-        }
-
-        @Override
-        public Site getSite() {
-            return this.site;
-        }
-    }
+//    public static class SpiderEventProcessor implements PageProcessor {
+//        private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
+//
+//        private final Site site;
+//
+//        private final SentimentSpiderSiteRule rule;
+//
+//
+//        public SpiderEventProcessor(SentimentSpiderSiteRule rule) {
+//            this.rule = rule;
+//            this.site = Site.me();
+//            this.site.setUseGzip(true);
+//            this.site.setRetryTimes(3);
+//            this.site.setSleepTime(1000);
+//        }
+//
+//        @Override
+//        public void process(Page page) {
+//            List<String> listUrls = page.getHtml().links().all();
+//
+//            listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
+//
+//            List<UrlPatterns> listUrlPatterns = this.rule.getUrlPatterns();
+//
+//            AtomicReference<UrlPatterns> matchedUrlPattern = new AtomicReference<>(null);
+//            if (listUrlPatterns != null) {
+//                listUrls.removeIf(url -> {
+//                    log.info(url);
+//                    boolean hasMatched = false;
+//                    for (UrlPatterns urlPattern : listUrlPatterns) {
+//                        hasMatched = Pattern.matches(urlPattern.getUrlPattern(), url);
+//                        if (hasMatched) {
+//                            break;
+//                        }
+//                    }
+//                    return !hasMatched;
+//                });
+//
+//                for (UrlPatterns urlPattern : listUrlPatterns) {
+//                    if (Pattern.matches(urlPattern.getUrlPattern(), page.getUrl().get())) {
+//                        matchedUrlPattern.set(urlPattern);
+//                    }
+//                }
+//            }
+//
+//            LIST_SPIDER_URLS.addAll(listUrls);
+//
+//            page.addTargetRequests(listUrls);
+//
+//            Document document = page.getHtml().getDocument();
+//
+//            String title = document.title();
+//            String bodyText = "";
+//            String html = document.html();
+//
+//            String keywords = "";
+//            String description = "";
+//            Elements elements = document.getElementsByTag("meta");
+//
+//            if (elements != null && elements.size() > 0) {
+//                for (Element element : elements) {
+//                    String metaName = element.attr("name");
+//                    String metaContent = element.attr("content");
+//                    if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
+//                        keywords = metaContent;
+//                    }
+//                    if (StringUtils.equalsIgnoreCase("description", metaName)) {
+//                        description = metaContent;
+//                    }
+//                }
+//            }
+//            Selectable selectable;
+//            UrlPatterns urlPatterns = null;
+//            if (matchedUrlPattern.get() != null) {
+//                urlPatterns = matchedUrlPattern.get();
+//            }
+//
+//            if (urlPatterns == null) {
+//                selectable = page.getHtml();
+//            } else if ("XpathSelector".equals(urlPatterns.getSelectorName())) {
+//                XpathSelector xpathSelector = new XpathSelector(urlPatterns.getSelectorText());
+//                selectable = page.getHtml().select(xpathSelector);
+//            } else if ("CssSelector".equals(urlPatterns.getSelectorName())) {
+//                CssSelector cssSelector = new CssSelector(urlPatterns.getSelectorText());
+//                selectable = page.getHtml().select(cssSelector);
+//            } else {
+//                selectable = page.getHtml();
+//            }
+//
+//            List<String> listFragments = new ArrayList<>();
+//            if (selectable.match()) {
+//                document.html(selectable.get());
+//                bodyText = document.text();
+//
+//                Elements allElements = document.children().first().children();
+//                for (Element element : allElements) {
+//                    String text = element.text();
+//                    if (StringUtils.isNotBlank(text)) {
+//                        listFragments.add(text.trim());
+//                    }
+//                }
+//            }
+//
+//            log.info("{}>>>>>>>>{}>>>>>>>>{}", page.getUrl(), listFragments.size(), bodyText);
+//
+//            if (StringUtils.isBlank(bodyText)) {
+//                page.setSkip(true);
+//            }
+//
+//
+//            if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
+//                description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
+//            }
+//
+//            if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
+//                keywords = description;
+//            }
+//
+//            page.putField("title", title);
+//            page.putField("bodyText", bodyText);
+//            page.putField("html", html);
+//            page.putField("keywords", keywords);
+//            page.putField("description", description);
+//            page.putField("listFragments", JSON.toJSONString(listFragments));
+//        }
+//
+//        @Override
+//        public Site getSite() {
+//            return this.site;
+//        }
+//    }
 }

+ 154 - 0
src/main/java/com/zhiqiyun/open/core/spiders/HtmlPageProcessor.java

@@ -0,0 +1,154 @@
+package com.zhiqiyun.open.core.spiders;
+
+import com.alibaba.fastjson.JSON;
+import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderSiteRule;
+import com.zhiqiyun.open.core.models.sentiment.UrlPatterns;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.CssSelector;
+import us.codecraft.webmagic.selector.Selectable;
+import us.codecraft.webmagic.selector.XpathSelector;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.regex.Pattern;
+
+@Slf4j
+public class HtmlPageProcessor implements PageProcessor { // WebMagic processor: filters discovered links by the rule's URL patterns and publishes title/meta/body text as page fields
+    private final List<String> LIST_SPIDER_URLS = new ArrayList<>(); // links this instance has already queued; NOTE(review): contains() on an ArrayList is linear and the list is unsynchronized — confirm the spider stays single-threaded
+
+    private final Site site; // crawl settings built in the constructor and returned by getSite()
+
+    private final SentimentSpiderSiteRule rule; // supplies the URL patterns and selectors read in process()
+
+
+    public HtmlPageProcessor(SentimentSpiderSiteRule rule) { // stores the rule and builds the Site settings
+        this.rule = rule;
+        this.site = Site.me();
+        this.site.setUseGzip(true);
+        this.site.setRetryTimes(3);
+        this.site.setSleepTime(1000); // presumably milliseconds — confirm against WebMagic Site docs
+    }
+
+    @Override
+    public void process(Page page) { // queues matching links, then extracts text from the current page into page fields
+        List<String> listUrls = page.getHtml().links().all(); // every link found in the downloaded HTML
+
+        listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url)); // drop blank links and links queued earlier by this instance
+
+        List<UrlPatterns> listUrlPatterns = this.rule.getUrlPatterns();
+
+        AtomicReference<UrlPatterns> matchedUrlPattern = new AtomicReference<>(null); // pattern entry matching the current page URL, if any
+        if (listUrlPatterns != null) { // with a null pattern list no link filtering happens and no selector is chosen
+            listUrls.removeIf(url -> { // keep only links that match at least one configured regex
+                log.info(url); // logs every candidate link at INFO level
+                boolean hasMatched = false;
+                for (UrlPatterns urlPattern : listUrlPatterns) {
+                    hasMatched = Pattern.matches(urlPattern.getUrlPattern(), url); // whole-string match; the regex is recompiled on each call
+                    if (hasMatched) {
+                        break;
+                    }
+                }
+                return !hasMatched; // removeIf discards the link when nothing matched
+            });
+
+            for (UrlPatterns urlPattern : listUrlPatterns) { // no break here, so the last matching entry wins
+                if (Pattern.matches(urlPattern.getUrlPattern(), page.getUrl().get())) {
+                    matchedUrlPattern.set(urlPattern);
+                }
+            }
+        }
+
+        LIST_SPIDER_URLS.addAll(listUrls); // remember surviving links so later pages do not queue them again
+
+        page.addTargetRequests(listUrls); // hand surviving links to the spider as new requests
+
+        Document document = page.getHtml().getDocument();
+
+        String title = document.title();
+        String bodyText = "";
+        String html = document.html(); // captured before document.html(...) below overwrites the document content
+
+        String keywords = "";
+        String description = "";
+        Elements elements = document.getElementsByTag("meta"); // meta tags are scanned for keywords and description
+
+        if (elements != null && elements.size() > 0) {
+            for (Element element : elements) {
+                String metaName = element.attr("name");
+                String metaContent = element.attr("content");
+                if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
+                    keywords = metaContent; // a later matching meta tag overwrites an earlier one
+                }
+                if (StringUtils.equalsIgnoreCase("description", metaName)) {
+                    description = metaContent; // a later matching meta tag overwrites an earlier one
+                }
+            }
+        }
+        Selectable selectable; // content region used for body text and fragments
+        UrlPatterns urlPatterns = null;
+        if (matchedUrlPattern.get() != null) {
+            urlPatterns = matchedUrlPattern.get();
+        }
+
+        if (urlPatterns == null) { // no pattern matched the page URL: use the whole page
+            selectable = page.getHtml();
+        } else if ("XpathSelector".equals(urlPatterns.getSelectorName())) { // selectorText is treated as an XPath expression
+            XpathSelector xpathSelector = new XpathSelector(urlPatterns.getSelectorText());
+            selectable = page.getHtml().select(xpathSelector);
+        } else if ("CssSelector".equals(urlPatterns.getSelectorName())) { // selectorText is treated as a CSS selector
+            CssSelector cssSelector = new CssSelector(urlPatterns.getSelectorText());
+            selectable = page.getHtml().select(cssSelector);
+        } else { // unrecognised selector name: fall back to the whole page
+            selectable = page.getHtml();
+        }
+
+        List<String> listFragments = new ArrayList<>(); // trimmed, non-blank text of each element collected below
+        if (selectable.match()) {
+            document.html(selectable.get()); // overwrites the document content with the selected markup
+            bodyText = document.text();
+
+            Elements allElements = document.children().first().children(); // NOTE(review): first() may be null if the rewritten document has no child element — confirm
+            for (Element element : allElements) {
+                String text = element.text();
+                if (StringUtils.isNotBlank(text)) {
+                    listFragments.add(text.trim());
+                }
+            }
+        }
+
+        log.info("{}>>>>>>>>{}>>>>>>>>{}", page.getUrl(), listFragments.size(), bodyText); // logs the full body text of every page at INFO level
+
+        if (StringUtils.isBlank(bodyText)) {
+            page.setSkip(true); // marks the page as skipped; the fields below are still written
+        }
+
+
+        if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) { // fallback description: at most the first 200 chars of body text
+            description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
+        }
+
+        if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) { // fallback keywords: reuse the description
+            keywords = description;
+        }
+
+        page.putField("title", title);
+        page.putField("bodyText", bodyText);
+        page.putField("html", html);
+        page.putField("keywords", keywords);
+        page.putField("description", description);
+        page.putField("listFragments", JSON.toJSONString(listFragments)); // stored as a JSON string rather than a List
+    }
+
+    @Override
+    public Site getSite() { // returns the Site configured in the constructor
+        return this.site;
+    }
+}

+ 0 - 153
src/main/java/com/zhiqiyun/open/core/spiders/OkHttpDownloader.java

@@ -1,153 +0,0 @@
-package com.zhiqiyun.open.core.spiders;
-
-import lombok.extern.slf4j.Slf4j;
-import okhttp3.OkHttpClient;
-import okhttp3.Response;
-import org.apache.commons.io.IOUtils;
-import org.apache.http.HttpResponse;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.stereotype.Component;
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Request;
-import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.downloader.Downloader;
-import us.codecraft.webmagic.selector.PlainText;
-import us.codecraft.webmagic.utils.CharsetUtils;
-import us.codecraft.webmagic.utils.HttpClientUtils;
-
-import java.io.IOException;
-import java.nio.charset.Charset;
-
-@Slf4j
-@Component("okhttp.downloader")
-public class OkHttpDownloader implements Downloader {
-
-    @Autowired
-    private OkHttpClient okHttpClient;
-
-    @Override
-    public Page download(Request request, Task task) {
-        Page page = new Page();
-        try {
-            log.info(request.getUrl());
-
-            okhttp3.Request.Builder builder = new okhttp3.Request.Builder();
-            builder.url(request.getUrl());
-            builder.get();
-
-            Response resp = this.okHttpClient.newCall(builder.build()).execute();
-
-            String rawText = resp.body().string();
-
-            page.setBytes(rawText.getBytes());
-            page.setRawText(rawText);
-            page.setUrl(new PlainText(request.getUrl()));
-            page.setRequest(request);
-            page.setStatusCode(resp.code());
-            page.setDownloadSuccess(true);
-            page.setHeaders(resp.headers().toMultimap());
-        } catch (Exception e) {
-            log.error("", e);
-        }
-        return page;
-    }
-
-    @Override
-    public void setThread(int i) {
-        log.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>{}", i);
-    }
-
-
-    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
-        byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
-        String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
-        Page page = new Page();
-        page.setBytes(bytes);
-        if (charset == null) {
-            charset = this.getHtmlCharset(contentType, bytes);
-        }
-
-        page.setCharset(charset);
-        page.setRawText(new String(bytes, charset));
-
-        page.setUrl(new PlainText(request.getUrl()));
-        page.setRequest(request);
-        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
-        page.setDownloadSuccess(true);
-        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
-
-
-        return page;
-    }
-
-    private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
-        String charset = CharsetUtils.detectCharset(contentType, contentBytes);
-        if (charset == null) {
-            charset = Charset.defaultCharset().name();
-            log.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
-        }
-
-        return charset;
-    }
-}
-/**
- * package util;
- * <p>
- * import java.io.BufferedReader;
- * import java.io.File;
- * import java.io.FileWriter;
- * import java.io.InputStream;
- * import java.io.InputStreamReader;
- * import java.io.PrintWriter;
- * <p>
- * import us.codecraft.webmagic.Page;
- * import us.codecraft.webmagic.Request;
- * import us.codecraft.webmagic.selector.PlainText;
- * <p>
- * public class GetAjaxHtml {
- * public static String getAjaxContent(String url) throws Exception {
- * Runtime rt = Runtime.getRuntime();
- * Process p = rt
- * .exec("D:/phantomjs-2.1.1-windows/bin/phantomjs.exe D:/s.js "
- * + url);
- * InputStream is = p.getInputStream();
- * BufferedReader br = new BufferedReader(new InputStreamReader(is));
- * StringBuffer sbf = new StringBuffer();
- * String tmp = "";
- * while ((tmp = br.readLine()) != null) {
- * sbf.append(tmp + "\n");
- * }
- * return sbf.toString();
- * }
- * <p>
- * public static Page download(Request request) {
- * Page page = new Page();
- * try {
- * String url = request.getUrl();
- * String html = getAjaxContent(url);
- * page.setRawText(html);
- * page.setUrl(new PlainText(url));
- * page.setRequest(request);
- * return page;
- * } catch (Exception e) {
- * System.out.println("download出错了!");
- * return page;
- * }
- * }
- * <p>
- * public static void main(String[] args) throws Exception {
- * long start = System.currentTimeMillis();
- * String result = getAjaxContent("http://www.taobao.com");
- * System.out.println(result);
- * // 创建新文件
- * String path = "D:\\testFile\\taobao.html";
- * PrintWriter printWriter = null;
- * printWriter = new PrintWriter(new FileWriter(new File(path)));
- * printWriter.write(result);
- * printWriter.close();
- * long end = System.currentTimeMillis();
- * System.out.println("===============耗时:" + (end - start)
- * + "===============");
- * }
- * }
- */

+ 87 - 0
src/main/java/com/zhiqiyun/open/core/spiders/SpiderEventPipeline.java

@@ -0,0 +1,87 @@
+package com.zhiqiyun.open.core.spiders;
+
+import com.alibaba.fastjson.JSON;
+import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderResult;
+import com.zhiqiyun.open.core.service.SentimentSpiderResultService;
+import com.zhiqiyun.open.utils.DateUtil;
+import org.apache.commons.codec.digest.DigestUtils;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.pipeline.Pipeline;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * WebMagic pipeline that stores a crawled page as a {@link SentimentSpiderResult}
+ * when the page's body text mentions every configured keyword.
+ */
+public class SpiderEventPipeline implements Pipeline {
+
+    private final String[] keywords;
+    private final Long eventId;
+    private final Long siteRuleId;
+
+    private final SentimentSpiderResultService sentimentSpiderResultService;
+
+    /**
+     * @param keywords                     keywords that must all occur in the page body
+     * @param eventId                      value stored as the result's event id
+     * @param siteRuleId                   value stored as the result's site rule id
+     * @param sentimentSpiderResultService service used to save or update matching results
+     */
+    public SpiderEventPipeline(String[] keywords, Long eventId, Long siteRuleId, SentimentSpiderResultService sentimentSpiderResultService) {
+        this.sentimentSpiderResultService = sentimentSpiderResultService;
+        this.keywords = keywords;
+        this.eventId = eventId;
+        this.siteRuleId = siteRuleId;
+    }
+
+    /**
+     * Saves the extracted page fields when {@code bodyText} contains all keywords.
+     * Fields that are missing or were put with a {@code null} value are treated as empty.
+     *
+     * @param resultItems fields extracted by the page processor
+     * @param task        the running spider task (not used)
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        Map<String, Object> dataMap = resultItems.getAll();
+
+        String bodyText = fieldAsString(dataMap, "bodyText", "");
+        // Reject first so that no highlighting work is done for pages that are not stored.
+        if (!this.containsAllKeywords(bodyText)) {
+            return;
+        }
+
+        String url = resultItems.getRequest().getUrl();
+        // The scheme is stripped before hashing so http and https variants share one id.
+        String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
+
+        List<String> listFragments = JSON.parseArray(fieldAsString(dataMap, "listFragments", "[]"), String.class);
+
+        SentimentSpiderResult result = new SentimentSpiderResult();
+        result.setId(id);
+        result.setEventId(eventId);
+        result.setSiteRuleId(siteRuleId);
+        result.setUrl(url);
+        result.setTitle(fieldAsString(dataMap, "title", ""));
+        result.setKeywords(fieldAsString(dataMap, "keywords", ""));
+        result.setDescription(fieldAsString(dataMap, "description", ""));
+        result.setFragments(this.highlightFragments(listFragments));
+        result.setBodyText(bodyText);
+        result.setHtml(fieldAsString(dataMap, "html", ""));
+        result.setSpiderTime(DateUtil.current());
+        result.setUpdateTime(DateUtil.current());
+        this.sentimentSpiderResultService.saveOrUpdate(result);
+    }
+
+    /** Returns {@code true} when the text contains every keyword (also when there are no keywords). */
+    private boolean containsAllKeywords(String text) {
+        for (String keyword : this.keywords) {
+            if (!text.contains(keyword)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Keeps the fragments that contain at least one keyword and wraps each keyword
+     * occurrence in {@code <em>} tags. Matching is done on the untouched fragment so
+     * that markup inserted for one keyword cannot make another keyword match.
+     */
+    private List<String> highlightFragments(List<String> rawFragments) {
+        List<String> fragments = new ArrayList<>();
+        if (rawFragments == null) {
+            return fragments;
+        }
+        for (String raw : rawFragments) {
+            if (raw == null) {
+                continue;
+            }
+            String highlighted = raw;
+            boolean matched = false;
+            for (String keyword : this.keywords) {
+                if (raw.contains(keyword)) {
+                    matched = true;
+                    highlighted = highlighted.replace(keyword, "<em>" + keyword + "</em>");
+                }
+            }
+            if (matched) {
+                fragments.add(highlighted);
+            }
+        }
+        return fragments;
+    }
+
+    /** Returns the field as a string, or {@code defaultValue} when the key is absent or mapped to null. */
+    private static String fieldAsString(Map<String, Object> dataMap, String key, String defaultValue) {
+        Object value = dataMap.get(key);
+        return value == null ? defaultValue : value.toString();
+    }
+}

+ 103 - 0
src/main/java/com/zhiqiyun/open/core/spiders/WeiboPageProcessor.java

@@ -0,0 +1,103 @@
+package com.zhiqiyun.open.core.spiders;
+
+import com.alibaba.fastjson.JSON;
+import com.alibaba.fastjson.JSONObject;
+import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderSiteRule;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.net.URL;
+import java.util.*;
+
+/**
+ * WebMagic page processor for the JSON responses of the Weibo
+ * {@code /ajax/feed/hottimeline} endpoint. Each valid response is turned into four
+ * parallel JSON-encoded lists (creation dates, ids, titles, text fragments) and the
+ * following page (same query with {@code max_id + 1}) is queued.
+ */
+@Slf4j
+public class WeiboPageProcessor implements PageProcessor {
+
+    private final Site site;
+
+    private final SentimentSpiderSiteRule rule;
+
+
+    /**
+     * @param rule the site rule this processor was created for (stored, not read by this class)
+     */
+    public WeiboPageProcessor(SentimentSpiderSiteRule rule) {
+        this.rule = rule;
+        this.site = Site.me();
+        this.site.setUseGzip(true);
+        this.site.setRetryTimes(3);
+        this.site.setSleepTime(1000);
+    }
+
+    /**
+     * Extracts the statuses of one feed page. The page is skipped, and no further page is
+     * queued, when the response is not {@code ok == 1}, has no statuses, or cannot be parsed.
+     *
+     * @param page the downloaded feed page
+     */
+    @Override
+    public void process(Page page) {
+        try {
+            URL url = new URL(page.getUrl().get());
+            Map<String, String> queryMap = this.getMap(url.getQuery());
+
+            log.info("max_id>>>>>>>>>{}", queryMap.get("max_id"));
+
+            JSONObject jsonObject = JSON.parseObject(page.getRawText());
+            // getIntValue returns 0 for a missing "ok", so a malformed reply is skipped instead of throwing.
+            if (jsonObject == null || jsonObject.getIntValue("ok") != 1 || jsonObject.get("statuses") == null) {
+                page.setSkip(true);
+                return;
+            }
+
+            List<JSONObject> list = jsonObject.getJSONArray("statuses").toJavaList(JSONObject.class);
+            if (list.isEmpty()) {
+                page.setSkip(true);
+                return;
+            }
+
+            // Queue the next page only after this one proved valid; otherwise an error
+            // response would keep the crawl paging forever.
+            String nextPageUrl = this.buildNextPageUrl(queryMap);
+            if (nextPageUrl != null) {
+                page.addTargetRequest(nextPageUrl);
+            }
+
+            List<String> listFragments = new ArrayList<>();
+            List<String> listTitles = new ArrayList<>();
+            List<String> listIds = new ArrayList<>();
+            List<String> listCreatedAts = new ArrayList<>();
+            for (JSONObject o : list) {
+                if (o == null) {
+                    continue;
+                }
+                // All four lists get exactly one entry per status so their indexes stay aligned.
+                listCreatedAts.add(o.getString("created_at"));
+                listIds.add(o.getString("id"));
+                listTitles.add(o.getString("source"));
+                JSONObject longText = o.getJSONObject("longText");
+                if (longText != null) {
+                    listFragments.add(longText.getString("longTextContent"));
+                } else {
+                    listFragments.add(o.getString("text_raw"));
+                }
+            }
+
+            page.putField("listCreatedAts", JSON.toJSONString(listCreatedAts));
+            page.putField("listIds", JSON.toJSONString(listIds));
+            page.putField("listTitles", JSON.toJSONString(listTitles));
+            page.putField("listFragments", JSON.toJSONString(listFragments));
+        } catch (Exception e) {
+            log.error("Failed to process weibo page {}", page.getUrl(), e);
+            page.setSkip(true);
+        }
+    }
+
+    @Override
+    public Site getSite() {
+        return this.site;
+    }
+
+    /**
+     * Parses a raw URL query string into a map that keeps the parameter order.
+     * Parameters without an {@code =} are ignored; empty values and values that
+     * themselves contain {@code =} are kept.
+     *
+     * @param urlparam the query string without the leading {@code ?}; may be {@code null}
+     * @return the parsed parameters, empty when there is no query
+     */
+    public Map<String, String> getMap(String urlparam) {
+        Map<String, String> map = new LinkedHashMap<>();
+        if (urlparam == null || urlparam.isEmpty()) {
+            return map;
+        }
+        for (String keyvalue : urlparam.split("&")) {
+            // Limit 2: split on the first '=' only and keep a trailing empty value.
+            String[] pair = keyvalue.split("=", 2);
+            if (pair.length == 2) {
+                map.put(pair[0], pair[1]);
+            }
+        }
+        return map;
+    }
+
+    /**
+     * Builds the URL of the following feed page by incrementing {@code max_id}.
+     *
+     * @return the next page URL, or {@code null} when {@code max_id} is absent or not numeric
+     */
+    private String buildNextPageUrl(Map<String, String> queryMap) {
+        String maxId = queryMap.get("max_id");
+        if (maxId == null) {
+            return null;
+        }
+        long nextMaxId;
+        try {
+            nextMaxId = Long.parseLong(maxId) + 1;
+        } catch (NumberFormatException ignored) {
+            // Without a numeric cursor there is no well-defined next page.
+            return null;
+        }
+        List<String> listQueries = new ArrayList<>();
+        for (Map.Entry<String, String> entry : queryMap.entrySet()) {
+            String value = "max_id".equals(entry.getKey()) ? String.valueOf(nextMaxId) : entry.getValue();
+            listQueries.add(String.format("%s=%s", entry.getKey(), value));
+        }
+        return String.format("https://weibo.com/ajax/feed/hottimeline?%s", StringUtils.join(listQueries, "&"));
+    }
+}

+ 100 - 0
src/main/java/com/zhiqiyun/open/core/spiders/WeiboPipeline.java

@@ -0,0 +1,100 @@
+package com.zhiqiyun.open.core.spiders;
+
+import com.alibaba.fastjson.JSON;
+import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderResult;
+import com.zhiqiyun.open.core.service.SentimentSpiderResultService;
+import com.zhiqiyun.open.utils.DateUtil;
+import lombok.extern.slf4j.Slf4j;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.pipeline.Pipeline;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * WebMagic pipeline that stores each Weibo status whose text contains at least one
+ * configured keyword as a {@link SentimentSpiderResult}. It reads the parallel
+ * {@code listFragments}, {@code listTitles} and {@code listIds} fields, each a
+ * JSON-encoded list of strings.
+ */
+@Slf4j
+public class WeiboPipeline implements Pipeline {
+
+    private final String[] keywords;
+    private final Long eventId;
+    private final Long siteRuleId;
+
+    private final SentimentSpiderResultService sentimentSpiderResultService;
+
+    /**
+     * @param keywords                     keywords of which at least one must occur in a status
+     * @param eventId                      value stored as the result's event id
+     * @param siteRuleId                   value stored as the result's site rule id
+     * @param sentimentSpiderResultService service used to save or update matching results
+     */
+    public WeiboPipeline(String[] keywords, Long eventId, Long siteRuleId, SentimentSpiderResultService sentimentSpiderResultService) {
+        this.sentimentSpiderResultService = sentimentSpiderResultService;
+        this.keywords = keywords;
+        this.eventId = eventId;
+        this.siteRuleId = siteRuleId;
+    }
+
+    /**
+     * Saves every status of the page that matches a keyword. A failure on one status is
+     * logged and does not prevent the remaining statuses from being processed.
+     *
+     * @param resultItems fields extracted by the page processor
+     * @param task        the running spider task (not used)
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        Map<String, Object> dataMap = resultItems.getAll();
+
+        List<String> listFragments = parseStringList(dataMap, "listFragments");
+        List<String> listTitles = parseStringList(dataMap, "listTitles");
+        List<String> listIds = parseStringList(dataMap, "listIds");
+
+        // The lists are indexed in parallel; never read past the shortest one.
+        int count = Math.min(listFragments.size(), Math.min(listTitles.size(), listIds.size()));
+        for (int index = 0; index < count; index++) {
+            String fragment = listFragments.get(index);
+            String id = listIds.get(index);
+            // A status without text cannot match, and one without an id has no usable key or URL.
+            if (fragment == null || id == null) {
+                continue;
+            }
+            try {
+                String emFragment = this.highlight(fragment);
+                if (emFragment == null) {
+                    continue;
+                }
+
+                SentimentSpiderResult result = new SentimentSpiderResult();
+                result.setId(id);
+                result.setEventId(eventId);
+                result.setSiteRuleId(siteRuleId);
+                result.setUrl(String.format("https://m.weibo.cn/detail/%s", id));
+                result.setTitle(listTitles.get(index));
+                result.setKeywords(fragment);
+                result.setDescription(fragment);
+                result.setFragments(Collections.singletonList(emFragment));
+                result.setBodyText(fragment);
+                result.setHtml(fragment);
+                result.setSpiderTime(DateUtil.current());
+                result.setUpdateTime(DateUtil.current());
+
+                this.sentimentSpiderResultService.saveOrUpdate(result);
+            } catch (Exception e) {
+                log.error("Failed to store weibo status {}", id, e);
+            }
+        }
+    }
+
+    /**
+     * Wraps every keyword occurrence in {@code <em>} tags.
+     *
+     * @return the highlighted text, or {@code null} when the fragment contains no keyword
+     */
+    private String highlight(String fragment) {
+        String highlighted = fragment;
+        boolean matched = false;
+        for (String keyword : this.keywords) {
+            if (fragment.contains(keyword)) {
+                matched = true;
+                highlighted = highlighted.replace(keyword, "<em>" + keyword + "</em>");
+            }
+        }
+        return matched ? highlighted : null;
+    }
+
+    /** Parses a JSON-encoded string list field; an absent or null field yields an empty list. */
+    private static List<String> parseStringList(Map<String, Object> dataMap, String key) {
+        Object raw = dataMap.get(key);
+        if (raw == null) {
+            return Collections.<String>emptyList();
+        }
+        List<String> parsed = JSON.parseArray(raw.toString(), String.class);
+        return parsed == null ? Collections.<String>emptyList() : parsed;
+    }
+}

+ 0 - 46
src/main/java/com/zhiqiyun/open/core/typeHandler/FastjsonTypeHandler.java

@@ -1,46 +0,0 @@
-package com.zhiqiyun.open.core.typeHandler;
-
-import com.alibaba.fastjson.JSON;
-import com.alibaba.fastjson.TypeReference;
-import lombok.extern.slf4j.Slf4j;
-import org.apache.ibatis.type.BaseTypeHandler;
-import org.apache.ibatis.type.JdbcType;
-import org.apache.ibatis.type.MappedJdbcTypes;
-import org.apache.ibatis.type.MappedTypes;
-
-import java.sql.CallableStatement;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-@Slf4j
-@MappedTypes({Object.class})
-@MappedJdbcTypes({JdbcType.VARCHAR})
-public class FastjsonTypeHandler<E> extends BaseTypeHandler<E> {
-
-    private E parse(String jsonValue) {
-        return jsonValue != null && jsonValue.length() != 0 ? JSON.parseObject(jsonValue, new TypeReference<E>() {
-        }) : null;
-    }
-
-    @Override
-    public void setNonNullParameter(PreparedStatement ps, int i, E parameter, JdbcType jdbcType) throws SQLException {
-        ps.setString(i, JSON.toJSONString(parameter));
-    }
-
-
-    @Override
-    public E getNullableResult(ResultSet rs, String columnName) throws SQLException {
-        return this.parse(rs.getString(columnName));
-    }
-
-    @Override
-    public E getNullableResult(ResultSet rs, int i) throws SQLException {
-        return this.parse(rs.getString(i));
-    }
-
-    @Override
-    public E getNullableResult(CallableStatement cs, int i) throws SQLException {
-        return this.parse(cs.getString(i));
-    }
-}

+ 19 - 0
src/main/java/com/zhiqiyun/open/core/typeHandler/UrlPatternTypeHandler.java

@@ -0,0 +1,19 @@
+package com.zhiqiyun.open.core.typeHandler;
+
+import com.alibaba.fastjson.JSON;
+import com.alibaba.fastjson.serializer.SerializerFeature;
+import com.baomidou.mybatisplus.extension.handlers.AbstractJsonTypeHandler;
+import com.zhiqiyun.open.core.models.sentiment.UrlPatterns;
+
+import java.util.List;
+
+/**
+ * MyBatis-Plus JSON type handler that converts a list of {@link UrlPatterns}
+ * to and from its JSON text form using fastjson.
+ */
+public class UrlPatternTypeHandler extends AbstractJsonTypeHandler<List<UrlPatterns>> {
+
+    /**
+     * @param json the JSON array text to parse
+     * @return the parsed list of URL patterns
+     */
+    @Override
+    protected List<UrlPatterns> parse(String json) {
+        return JSON.parseArray(json, UrlPatterns.class);
+    }
+
+    /**
+     * @param list the URL patterns to serialize
+     * @return the JSON text, written with the null-map-value, null-list-as-empty and
+     *         null-string-as-empty serializer features enabled
+     */
+    @Override
+    protected String toJson(List<UrlPatterns> list) {
+        return JSON.toJSONString(list, SerializerFeature.WriteMapNullValue, SerializerFeature.WriteNullListAsEmpty, SerializerFeature.WriteNullStringAsEmpty);
+    }
+}
+

+ 9 - 5
src/main/java/com/zhiqiyun/open/mvc/params/sentiment/SaveSpiderSiteRuleParam.java

@@ -1,12 +1,16 @@
 package com.zhiqiyun.open.mvc.params.sentiment;
 
+import com.zhiqiyun.open.core.models.sentiment.UrlPatterns;
 import lombok.Data;
 
+import java.util.List;
+
 @Data
 public class SaveSpiderSiteRuleParam {
-	private Long id;
-	private String siteName;
-	private String urlPatterns;
-	private String startUrls;
-	private String xpath;
+    private Long id;
+    private String siteName;
+    private String collectorName;
+    private List<UrlPatterns> urlPatterns;
+    private String startUrls;
+    private String xpath;
 }

+ 2 - 0
src/main/resources/application-prod.properties

@@ -14,3 +14,5 @@ uploader.max-size=20480000
 uploader.file-host=http://47.114.32.188:9800/src
 uploader.allow-file-types=jpg,jpeg,png,gif
 uploader.save-path=/data/uploads
+####################### spider config ###############################
+spider.phantomjs_executable_path_property=/data/phantomjs-2.1.1-linux-x86_64/bin/phantomjs

+ 21 - 0
src/main/resources/db/migration/V1.1.3__update_sentiment_spider.sql

@@ -0,0 +1,21 @@
+-- Recreates the sentiment_spider_site_rule table from scratch.
+-- WARNING: the existing table, including every stored rule, is dropped first.
+DROP TABLE IF EXISTS `sentiment_spider_site_rule`;
+CREATE TABLE `sentiment_spider_site_rule`
+(
+    `id`             BIGINT(20) NOT NULL COMMENT 'ID',
+    `site_name`      VARCHAR(50) NOT NULL COMMENT '网站名称',
+    `collector_name` VARCHAR(50) NOT NULL COMMENT '采集器名称',
+    `url_patterns`   text        NOT NULL COMMENT '匹配URL地址',
+    `start_urls`     text        NOT NULL COMMENT '启动地址',
+    `created_time`   DATETIME    NOT NULL COMMENT '创建时间',
+    `created_by`     BIGINT(20) NOT NULL COMMENT '创建人',
+    `updated_time`   DATETIME NULL DEFAULT NULL COMMENT '修改时间',
+    `updated_by`     BIGINT(20) NULL DEFAULT NULL COMMENT '修改人',
+    PRIMARY KEY (`id`)
+) COMMENT ='采集网站规则' ENGINE = InnoDB;
+
+-- Removes the rows of sentiment_spider_event and sentiment_spider_result whose id is not null.
+-- NOTE(review): the "where id is not null" predicate presumably exists only to satisfy a
+-- safe-update mode that rejects DELETE without WHERE -- confirm.
+delete
+from sentiment_spider_event
+where id is not null;
+delete
+from sentiment_spider_result
+where id is not null;

+ 49 - 61
src/test/java/com/zhiqiyun/SimplePageProcessor.java

@@ -1,76 +1,64 @@
 package com.zhiqiyun;
 
-import com.alibaba.fastjson.JSON;
 import com.zhiqiyun.open.core.models.sentiment.SentimentSpiderSiteRule;
-import com.zhiqiyun.open.core.service.impl.SentimentSpiderEventServiceImpl;
-import com.zhiqiyun.open.core.spiders.OkHttpDownloader;
 import lombok.extern.slf4j.Slf4j;
-import okhttp3.OkHttpClient;
-import org.apache.commons.codec.digest.DigestUtils;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.pipeline.Pipeline;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.TimeUnit;
 
 @Slf4j
 public class SimplePageProcessor {
     public static void main(String[] args) {
         SentimentSpiderSiteRule rule = new SentimentSpiderSiteRule();
         rule.setStartUrls("https://www.188420.com/");
-        rule.setUrlPatterns("https://www.188420.com/a/([0-9]+).html");
-        rule.setXpath("/html/body/div[3]/div[1]/div[1]");
+//        rule.setUrlPatterns("https://www.188420.com/a/([0-9]+).html");
+//        rule.setXpath("/html/body/div[3]/div[1]/div[1]");
 
         String[] startUrlArray = rule.getStartUrls().split("\n");
 
-        Spider spider = Spider.create(new SentimentSpiderEventServiceImpl.SpiderEventProcessor(rule));
-
-
-        List<Pipeline> pipelines = new ArrayList<>();
-        pipelines.add((resultItems, task) -> {
-            Map<String, Object> dataMap = resultItems.getAll();
-
-            String url = resultItems.getRequest().getUrl();
-            String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
-
-            String title = dataMap.getOrDefault("title", "").toString();
-            String keywords = dataMap.getOrDefault("keywords", "").toString();
-            String description = dataMap.getOrDefault("description", "").toString();
-            String bodyText = dataMap.getOrDefault("bodyText", "").toString();
-            String html = dataMap.getOrDefault("html", "").toString();
-            List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
-
-            String[] listKeywords = new String[]{"十三届五次会", "习近平"};
-
-            boolean flag = true;
-            for (String keyword : listKeywords) {
-                if (!bodyText.contains(keyword)) {
-                    flag = false;
-                    break;
-                }
-            }
-
-            listFragments.removeIf(o -> {
-                boolean hasContains = false;
-                for (String keyword : listKeywords) {
-                    if (o.contains(keyword)) {
-                        hasContains = true;
-                        break;
-                    }
-                }
-                return !hasContains;
-            });
-
-            log.info("{}>>>>>>>>{}>>>>>>>>{}>>>>>>>>{}", url, flag, listFragments.size(), bodyText);
-        });
-        spider.setPipelines(pipelines);
-        spider.setExitWhenComplete(true);
-        spider.addUrl(startUrlArray);
-        OkHttpClient.Builder builder = new OkHttpClient.Builder();
-        builder.connectTimeout(30, TimeUnit.SECONDS);
-        spider.setDownloader(new OkHttpDownloader());
-        spider.runAsync();
+//        Spider spider = Spider.create(new SentimentSpiderEventServiceImpl.SpiderEventProcessor(rule));
+//
+//
+//        List<Pipeline> pipelines = new ArrayList<>();
+//        pipelines.add((resultItems, task) -> {
+//            Map<String, Object> dataMap = resultItems.getAll();
+//
+//            String url = resultItems.getRequest().getUrl();
+//            String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
+//
+//            String title = dataMap.getOrDefault("title", "").toString();
+//            String keywords = dataMap.getOrDefault("keywords", "").toString();
+//            String description = dataMap.getOrDefault("description", "").toString();
+//            String bodyText = dataMap.getOrDefault("bodyText", "").toString();
+//            String html = dataMap.getOrDefault("html", "").toString();
+//            List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
+//
+//            String[] listKeywords = new String[]{"十三届五次会", "习近平"};
+//
+//            boolean flag = true;
+//            for (String keyword : listKeywords) {
+//                if (!bodyText.contains(keyword)) {
+//                    flag = false;
+//                    break;
+//                }
+//            }
+//
+//            listFragments.removeIf(o -> {
+//                boolean hasContains = false;
+//                for (String keyword : listKeywords) {
+//                    if (o.contains(keyword)) {
+//                        hasContains = true;
+//                        break;
+//                    }
+//                }
+//                return !hasContains;
+//            });
+//
+//            log.info("{}>>>>>>>>{}>>>>>>>>{}>>>>>>>>{}", url, flag, listFragments.size(), bodyText);
+//        });
+//        spider.setPipelines(pipelines);
+//        spider.setExitWhenComplete(true);
+//        spider.addUrl(startUrlArray);
+//        OkHttpClient.Builder builder = new OkHttpClient.Builder();
+//        builder.connectTimeout(30, TimeUnit.SECONDS);
+//        spider.setDownloader(new OkHttpDownloader());
+//        spider.runAsync();
     }
 }