网上很少有用 Kotlin 写的爬虫。其实 Kotlin 可以直接调用 Java 的库,写起爬虫来又快又好。

在 build.gradle 中添加依赖库【JSOUP库】

implementation 'org.jsoup:jsoup:1.12.2'

电影网站爬虫:

import org.jsoup.Jsoup
import java.io.File
import java.net.URL
import java.nio.charset.Charset

/**
 * Crawls the movie list pages 1..201 of ygdy8.net, follows every movie link found on a
 * list page, and records the movie name, detail-page URL and download text.
 *
 * Each record is printed to stdout and appended to `movies.txt` (truncated at start-up).
 * A page that cannot be downloaded is reported on stderr and skipped so that one bad
 * request does not abort the whole crawl.
 */
fun main() {
    val baseUrl = "https://ygdy8.net"
    // The site serves GBK-encoded pages; look the charset up once instead of per request.
    val gbk = Charset.forName("gbk")

    val file = File("movies.txt")
    file.writeText("")

    for (m in 1..201) {
        val listHtml = try {
            URL("$baseUrl/html/gndy/dyzz/list_23_$m.html").readText(gbk)
        } catch (e: java.io.IOException) {
            System.err.println("Failed to fetch list page $m: ${e.message}")
            continue
        }

        // Movie entries are looked up through the <b> tags of the list page.
        val movies = Jsoup.parse(listHtml).getElementsByTag("b")
        for (movie in movies) {
            val name = movie.text()
            val href = movie.select("a").attr("href")
            // A <b> tag without a link is not a movie entry; without this guard the
            // crawler would request the bare site root as if it were a detail page.
            if (href.isEmpty()) continue
            val url = baseUrl + href

            val detailHtml = try {
                URL(url).readText(gbk)
            } catch (e: java.io.IOException) {
                System.err.println("Failed to fetch detail page $url: ${e.message}")
                continue
            }

            // getElementById returns null when the page has no #Zoom element;
            // fall back to an empty download text instead of crashing.
            val downloadurl = Jsoup.parse(detailHtml)
                .getElementById("Zoom")
                ?.getElementsByTag("tr")
                ?.text()
                .orEmpty()

            println("电影名:" + name)
            println("详细链接:" + url)
            println("下载链接:" + downloadurl)
            println("")
            file.appendText("$name\n$url\n$downloadurl\n\n")
        }
    }
}

小说网站爬虫:

import org.jsoup.Jsoup
import java.io.File

/**
 * Downloads every chapter of one novel from xbiquge.la into `<novel name>.txt`.
 *
 * The index page is fetched through Jsoup (fetching the raw text directly tends to
 * produce garbled characters), the novel name is read from the `og:title` meta tag, and
 * each chapter linked from a `<dd>` element is fetched with a randomly chosen User-Agent.
 * The crawler pauses between requests to avoid overloading the server (HTTP 503).
 */
fun main() {
    val baseUrl = "http://www.xbiquge.la"
    val doc = Jsoup.connect("$baseUrl/10/10489/").get()

    // Novel title
    val name = doc.head().select("meta[property=og:title]").attr("content")

    // Create the output text file for the novel
    val novelFile = File("$name.txt")
    novelFile.writeText("小说名:$name\n\n")
    println(name)

    // Collect the link of every chapter page
    val chapters = doc.getElementsByTag("dd").select("a")

    // Pool of User-Agent strings used to disguise the requests
    val userAgents = listOf(
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"
    )

    for ((index, chapter) in chapters.withIndex()) {
        val chapterNumber = index + 1
        val chapterName = chapter.text()
        val chapterUrl = baseUrl + chapter.attr("href")

        // getElementById returns null when the page has no #content element;
        // report it and write an empty body rather than crashing mid-download.
        val contentElement = Jsoup.connect(chapterUrl)
            .userAgent(userAgents.random())
            .get()
            .getElementById("content")
        if (contentElement == null) {
            System.err.println("No content element found for chapter: $chapterName ($chapterUrl)")
        }
        // Strip the site's mobile-version advertisement from the chapter text.
        val chapterContent = contentElement?.text().orEmpty()
            .replace(" 手机站全新改版升级地址:http://m.xbiquge.la,数据和书签与电脑站同步,无广告清新阅读!", "")

        novelFile.appendText("$chapterName\n")
        novelFile.appendText("$chapterContent\n\n")
        println(chapterName)

        // Rest a little after every request so the server does not answer with 503.
        Thread.sleep(500)
        // Longer rest after every 30th chapter.
        if (chapterNumber % 30 == 0) {
            Thread.sleep(5000)
        }
        // Additional rest after every 100th chapter. The previous counter-based version
        // reset the wrong counter here, so from chapter 100 on it slept 5 s on every
        // chapter and the 30-chapter pause never triggered again.
        if (chapterNumber % 100 == 0) {
            Thread.sleep(5000)
        }
    }
}