package com.alag.ci.webcrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Queue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;

/* loaded from: input_file:com/alag/ci/webcrawler/NaiveCrawler.class */
/**
 * A simple, single-threaded web crawler.
 *
 * <p>URLs are taken from a caller-supplied queue, fetched over HTTP, tested against a
 * relevance pattern and, when relevant, recorded in the crawl output while the links
 * found in the page are appended to the queue. Before a URL is fetched the host's
 * {@code robots.txt} is consulted (and cached per host) for the paths disallowed to
 * all user agents.
 *
 * <p>This class performs no synchronization of its own.
 */
public class NaiveCrawler {
    private static final String USER_AGENT = "User-agent:";
    private static final String DISALLOW = "Disallow:";
    public static final String REGEXP_HTTP = "<a href=\"http://(.)*\">";
    public static final String REGEXP_RELATIVE = "<a href=\"(.)*\">";

    /** Matches the first {@code <title>} element, in any letter case, even across lines. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    private final int maxNumberUrls;
    private final long delayBetweenUrls;
    private final int maxDepth;
    private final Pattern regexpSearchPattern;
    /** Fetched pages, keyed by URL string. */
    private final Map<String, CrawlerUrl> visitedUrls;
    /** Cache of disallowed path prefixes, keyed by host name. */
    private final Map<String, Collection<String>> sitePermissions;
    private final Queue<CrawlerUrl> urlQueue;
    private final List<CrawlerUrl> crawlOutput;
    /** Number of pages added to the crawl output so far. */
    private int numberItemsSaved = 0;
    private final Pattern httpRegexp = Pattern.compile(REGEXP_HTTP);
    private final Pattern relativeRegexp = Pattern.compile(REGEXP_RELATIVE);

    /**
     * Creates a crawler.
     *
     * @param urlQueue            queue of URLs to crawl; newly discovered links are added to it
     * @param maxNumberUrls       crawling stops once this many pages have been fetched
     * @param maxDepth            URLs whose depth exceeds this value are skipped
     * @param delayBetweenUrls    pause, in milliseconds, after each fetched page
     * @param regexpSearchPattern regular expression that a page's lower-cased content must
     *                            contain for the page to be considered relevant
     * @throws Exception if the regular expression cannot be compiled
     */
    public NaiveCrawler(Queue<CrawlerUrl> urlQueue, int maxNumberUrls, int maxDepth,
            long delayBetweenUrls, String regexpSearchPattern) throws Exception {
        this.urlQueue = urlQueue;
        this.maxNumberUrls = maxNumberUrls;
        this.delayBetweenUrls = delayBetweenUrls;
        this.maxDepth = maxDepth;
        this.regexpSearchPattern = Pattern.compile(regexpSearchPattern);
        this.visitedUrls = new HashMap<String, CrawlerUrl>();
        this.sitePermissions = new HashMap<String, Collection<String>>();
        this.crawlOutput = new ArrayList<CrawlerUrl>();
    }

    /**
     * Runs the crawl until the queue is empty or the page limit is reached.
     *
     * @return the relevant pages that were found, in the order they were saved
     * @throws Exception if the thread is interrupted while pausing between pages
     */
    public List<CrawlerUrl> crawl() throws Exception {
        while (continueCrawling()) {
            CrawlerUrl nextUrl = getNextUrl();
            if (nextUrl == null) {
                // The queue was drained without finding a crawlable URL; the loop
                // condition ends the crawl on the next check.
                continue;
            }
            printCrawlInfo();
            String content = getContent(nextUrl);
            if (isContentRelevant(content, this.regexpSearchPattern)) {
                saveContent(nextUrl, content);
                addUrlsToUrlQueue(nextUrl, extractUrls(content, nextUrl));
            } else {
                System.out.println(nextUrl + " is not relevant ignoring ...");
            }
            Thread.sleep(this.delayBetweenUrls);
        }
        return this.crawlOutput;
    }

    /** Returns true while there is work queued and the page limit has not been reached. */
    private boolean continueCrawling() {
        return !this.urlQueue.isEmpty() && getNumberOfUrlsVisited() < this.maxNumberUrls;
    }

    /**
     * Removes URLs from the queue until one is found that may be crawled.
     *
     * @return the next crawlable URL, or {@code null} if the queue ran out
     */
    private CrawlerUrl getNextUrl() {
        while (!this.urlQueue.isEmpty()) {
            CrawlerUrl candidate = this.urlQueue.remove();
            if (doWeHavePermissionToVisit(candidate)
                    && !isUrlAlreadyVisited(candidate)
                    && isDepthAcceptable(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /** Prints a one-line progress summary to standard output. */
    private void printCrawlInfo() {
        StringBuilder sb = new StringBuilder();
        sb.append("Queue length = ").append(this.urlQueue.size())
                .append(" visited urls=").append(getNumberOfUrlsVisited())
                .append(" site permissions=").append(this.sitePermissions.size());
        System.out.println(sb.toString());
    }

    private int getNumberOfUrlsVisited() {
        return this.visitedUrls.size();
    }

    private boolean isDepthAcceptable(CrawlerUrl crawlerUrl) {
        return crawlerUrl.getDepth() <= this.maxDepth;
    }

    private boolean isUrlAlreadyVisited(CrawlerUrl crawlerUrl) {
        return crawlerUrl.isVisited() || this.visitedUrls.containsKey(crawlerUrl.getUrlString());
    }

    /**
     * Tells whether the given URL may be crawled according to its host's robots.txt.
     * The answer is computed once per URL object and stored on it.
     *
     * @param crawlerUrl the URL to check; may be {@code null}
     * @return {@code false} for a {@code null} or disallowed URL, {@code true} otherwise
     */
    public boolean doWeHavePermissionToVisit(CrawlerUrl crawlerUrl) {
        if (crawlerUrl == null) {
            return false;
        }
        if (!crawlerUrl.isCheckedForPermission()) {
            crawlerUrl.setAllowedToVisit(computePermissionForVisiting(crawlerUrl));
        }
        return crawlerUrl.isAllowedToVisit();
    }

    /**
     * Checks the URL's path against the disallowed path prefixes of its host, fetching
     * and caching the host's robots.txt on first use.
     */
    private boolean computePermissionForVisiting(CrawlerUrl crawlerUrl) {
        URL url = crawlerUrl.getURL();
        if (url == null) {
            return false;
        }
        String host = url.getHost();
        Collection<String> disallowedPaths = this.sitePermissions.get(host);
        if (disallowedPaths == null) {
            disallowedPaths = parseRobotsTxtFileToGetDisallowedPaths(host);
        }
        String path = url.getPath();
        if (path.length() == 0) {
            // "http://host" addresses the root document.
            path = "/";
        }
        for (String disallowedPrefix : disallowedPaths) {
            // robots.txt rules are path prefixes, not arbitrary substrings.
            if (path.startsWith(disallowedPrefix)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Downloads {@code http://<host>/robots.txt} and collects the non-empty
     * {@code Disallow} values of every group that applies to user agent {@code *}.
     * The result (empty when the file is missing or has no such rules) is cached
     * in {@link #sitePermissions}.
     *
     * @param host host name whose robots.txt should be read
     * @return the disallowed path prefixes for that host, never {@code null}
     */
    private Collection<String> parseRobotsTxtFileToGetDisallowedPaths(String host) {
        // Fetched directly so that robots.txt is not counted as a visited page.
        String content = fetchContent("http://" + host + "/robots.txt");
        List<String> disallowedPaths = new ArrayList<String>();
        if (content != null) {
            boolean inWildcardGroup = false;
            boolean previousLineWasUserAgent = false;
            for (String rawLine : content.split("\\r?\\n|\\r")) {
                String line = stripComment(rawLine).trim();
                if (line.length() == 0) {
                    continue;
                }
                if (startsWithIgnoreCase(line, USER_AGENT)) {
                    boolean wildcard = "*".equals(line.substring(USER_AGENT.length()).trim());
                    // Consecutive User-agent lines head one shared group of rules.
                    inWildcardGroup = previousLineWasUserAgent ? (inWildcardGroup || wildcard) : wildcard;
                    previousLineWasUserAgent = true;
                } else {
                    previousLineWasUserAgent = false;
                    if (inWildcardGroup && startsWithIgnoreCase(line, DISALLOW)) {
                        String path = line.substring(DISALLOW.length()).trim();
                        // An empty Disallow value means "nothing is disallowed"; storing it
                        // would match every path.
                        if (path.length() > 0) {
                            disallowedPaths.add(path);
                        }
                    }
                }
            }
        }
        this.sitePermissions.put(host, disallowedPaths);
        return disallowedPaths;
    }

    /** Returns the line with any trailing {@code #} comment removed. */
    private static String stripComment(String line) {
        int hash = line.indexOf('#');
        return hash >= 0 ? line.substring(0, hash) : line;
    }

    /** Case-insensitive variant of {@link String#startsWith(String)}. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Fetches the page behind the given URL and records the URL as visited, whether or
     * not the fetch succeeded.
     *
     * @return the page content, or {@code null} if it could not be retrieved
     */
    private String getContent(CrawlerUrl crawlerUrl) {
        String content = fetchContent(crawlerUrl.getUrlString());
        markUrlAsVisited(crawlerUrl);
        return content;
    }

    /**
     * Performs an HTTP GET and returns the response body as text.
     *
     * @param urlString absolute URL to fetch
     * @return the body of a 200 response, or {@code null} for any other status, a missing
     *         body, an invalid URL or an I/O failure (failures are printed, not thrown)
     */
    private String fetchContent(String urlString) {
        GetMethod getMethod = null;
        String content = null;
        try {
            HttpClient httpClient = new HttpClient();
            // Constructed inside the try block: an invalid URL must not abort the crawl.
            getMethod = new GetMethod(urlString);
            getMethod.getParams().setParameter("http.method.retry-handler", new DefaultHttpMethodRetryHandler(3, false));
            if (httpClient.executeMethod(getMethod) == 200) {
                InputStream body = getMethod.getResponseBodyAsStream();
                if (body != null) {
                    content = readContentsFromStream(new InputStreamReader(body, getMethod.getResponseCharSet()));
                }
            }
        } catch (Exception e) {
            System.out.println(e.toString());
            e.printStackTrace();
        } finally {
            if (getMethod != null) {
                getMethod.releaseConnection();
            }
        }
        return content;
    }

    /**
     * Reads the reader to its end.
     *
     * @return everything the reader produced, as one string
     * @throws IOException if reading fails
     */
    private static String readContentsFromStream(Reader reader) throws IOException {
        BufferedReader bufferedReader = reader instanceof BufferedReader ? (BufferedReader) reader : new BufferedReader(reader);
        StringBuilder sb = new StringBuilder();
        char[] buffer = new char[4096];
        int read;
        while ((read = bufferedReader.read(buffer)) != -1) {
            sb.append(buffer, 0, read);
        }
        return sb.toString();
    }

    private void markUrlAsVisited(CrawlerUrl crawlerUrl) {
        this.visitedUrls.put(crawlerUrl.getUrlString(), crawlerUrl);
        crawlerUrl.setIsVisited();
    }

    /**
     * Extracts the distinct link targets found in a page: absolute {@code http://} links
     * and host-relative links (those starting with {@code /}), the latter resolved
     * against the page's own URL.
     *
     * @param content    page content; {@code null} yields an empty list
     * @param crawlerUrl the page the content was fetched from
     * @return the distinct absolute URLs, in no particular order
     */
    public List<String> extractUrls(String content, CrawlerUrl crawlerUrl) {
        Map<String, String> urls = new HashMap<String, String>();
        if (content != null) {
            extractHttpUrls(urls, content);
            extractRelativeUrls(urls, content, crawlerUrl);
        }
        return new ArrayList<String>(urls.keySet());
    }

    /** Adds every absolute link found in the content to the map. */
    private void extractHttpUrls(Map<String, String> urls, String content) {
        Matcher matcher = this.httpRegexp.matcher(content);
        while (matcher.find()) {
            for (String href : hrefValues(matcher.group())) {
                if (href.startsWith("http")) {
                    urls.put(href, href);
                }
            }
        }
    }

    /** Adds every host-relative link found in the content, made absolute, to the map. */
    private void extractRelativeUrls(Map<String, String> urls, String content, CrawlerUrl crawlerUrl) {
        URL base = crawlerUrl.getURL();
        if (base == null) {
            return;
        }
        // Keep the page's own scheme and explicit port so the link resolves to the same server.
        StringBuilder origin = new StringBuilder(base.getProtocol()).append("://").append(base.getHost());
        if (base.getPort() != -1) {
            origin.append(':').append(base.getPort());
        }
        Matcher matcher = this.relativeRegexp.matcher(content);
        while (matcher.find()) {
            for (String href : hrefValues(matcher.group())) {
                if (href.startsWith("/")) {
                    String absolute = origin + href;
                    urls.put(absolute, absolute);
                }
            }
        }
    }

    /**
     * Splits matched anchor text on {@code a href="} and cuts each piece at its first
     * double quote, because one (greedy) regex match may span several anchors.
     */
    private static List<String> hrefValues(String anchorText) {
        List<String> values = new ArrayList<String>();
        for (String piece : anchorText.split("a href=\"")) {
            int quote = piece.indexOf("\"");
            values.add(quote > 0 ? piece.substring(0, quote) : piece);
        }
        return values;
    }

    /** Queues the given links, one level deeper than their parent, unless already fetched. */
    private void addUrlsToUrlQueue(CrawlerUrl crawlerUrl, Collection<String> urls) {
        int depth = crawlerUrl.getDepth() + 1;
        for (String url : urls) {
            if (!this.visitedUrls.containsKey(url)) {
                this.urlQueue.add(new CrawlerUrl(url, depth));
            }
        }
    }

    /**
     * Tells whether the pattern occurs anywhere in the lower-cased content.
     *
     * @param content page content; may be {@code null}
     * @param pattern pattern to search for (should be written for lower-case text)
     * @return {@code true} if the content is non-null and contains a match
     */
    public static boolean isContentRelevant(String content, Pattern pattern) {
        if (content == null) {
            return false;
        }
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        return pattern.matcher(content.toLowerCase(Locale.ROOT)).find();
    }

    /** Stores the page's title on the URL and appends the URL to the crawl output. */
    private void saveContent(CrawlerUrl crawlerUrl, String content) {
        crawlerUrl.setTitle(extractTitle(content));
        this.crawlOutput.add(crawlerUrl);
        this.numberItemsSaved++;
    }

    /** Returns the text of the first title element, or a placeholder when there is none. */
    private static String extractTitle(String content) {
        Matcher matcher = TITLE_PATTERN.matcher(content);
        return matcher.find() ? matcher.group(1) : "Sem titulo";
    }
}
