package cn.edu.hfut.dmic.webcollector.parser;

import cn.edu.hfut.dmic.webcollector.model.Link;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.util.CharsetDetector;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/* loaded from: input_file:cn/edu/hfut/dmic/webcollector/parser/HtmlParser.class */
/**
 * {@link Parser} implementation for HTML pages.
 *
 * <p>Decodes the raw page content into a string, parses it with jsoup, stores
 * both the decoded HTML and the parsed document back on the {@link Page}, and
 * returns the page title, the (optionally truncated) list of links, and the
 * document text.
 */
public class HtmlParser implements Parser {
    /** Maximum number of links kept per page; {@code null} means no limit. */
    private Integer topN;

    /** Creates a parser that keeps every link returned for a page. */
    public HtmlParser() {
        this(null);
    }

    /**
     * Creates a parser that keeps at most {@code num} links per page.
     *
     * @param num upper bound on the number of links kept, or {@code null} for no limit
     */
    public HtmlParser(Integer num) {
        this.topN = num;
    }

    /**
     * Parses the given page as HTML.
     *
     * <p>Side effects on {@code page}: its HTML is set to the decoded content
     * and its document is set to the parsed jsoup {@link Document}, whose base
     * URI is the page URL.
     *
     * @param page the page whose raw content is to be parsed
     * @return a result holding the parse data (URL, title, links) and the parse text (URL, text)
     * @throws UnsupportedEncodingException if the guessed charset name is not supported
     */
    @Override // cn.edu.hfut.dmic.webcollector.parser.Parser
    public ParseResult getParse(Page page) throws UnsupportedEncodingException {
        String url = page.getUrl();

        // Decode the raw bytes using the charset guessed from those same bytes.
        byte[] rawContent = page.getContent();
        String charsetName = CharsetDetector.guessEncoding(page.getContent());
        page.setHtml(new String(rawContent, charsetName));

        Document document = Jsoup.parse(page.getHtml());
        document.setBaseUri(url);
        page.setDoc(document);

        // Links are extracted only after the document has been attached to the page.
        String title = document.title();
        ArrayList<Link> links = topNFilter(LinkUtils.getAll(page));
        ParseData parseData = new ParseData(url, title, links);
        ParseText parseText = new ParseText(url, document.text());
        return new ParseResult(parseData, parseText);
    }

    /**
     * Returns a new list holding the leading links of {@code links}, capped at
     * {@code topN} entries when a limit is configured.
     *
     * @param links the full list of links, in order
     * @return a fresh list with at most {@code topN} leading elements; empty if
     *         {@code topN} is zero or negative
     */
    private ArrayList<Link> topNFilter(ArrayList<Link> links) {
        int limit = links.size();
        if (this.topN != null) {
            limit = Math.min(this.topN.intValue(), limit);
        }
        // A negative limit keeps nothing; clamp so subList gets a valid range.
        return new ArrayList<>(links.subList(0, Math.max(limit, 0)));
    }
}
