Groovy can run directly as a script, which makes a Groovy crawler especially convenient to run on the JVM (a sketch of declaring dependencies in-script follows below). Crawling a web page essentially comes down to the following steps:
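Since the snippets in this article are standalone scripts, the third-party dependencies can be pulled in with Grape's @Grab annotation instead of a build file. A minimal sketch; the artifact versions here are illustrative, not taken from the original:

@GrabConfig(systemClassLoader = true)   // JDBC drivers must load on the system classloader
@Grab('mysql:mysql-connector-java:8.0.28')
@Grab('org.jsoup:jsoup:1.14.3')
@Grab('org.codehaus.groovy.modules.http-builder:http-builder:0.7.1')
import org.jsoup.Jsoup

// run from the command line with:  groovy crawler.groovy
println Jsoup.connect('https://example.com').get().title()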
Fetching the page
To fetch a page's content you can use Jsoup directly. Since the network may be unreliable, a retry loop like the following Groovy snippet helps:
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements

String url = "https://www.dytt.com/"   // target page
int tryTimes = 0
Document doc = null
while (doc == null) {
    tryTimes += 1
    try {
        // Jsoup.connect(url).get() throws IOException on network errors,
        // so retry inside a try/catch rather than checking for null
        doc = Jsoup.connect(url).get()
    } catch (IOException e) {
        Thread.sleep(2000)
        println "======== has tried ${tryTimes} times =========="
    }
    if (tryTimes > 6) {
        println "======== has tried ${tryTimes} times and will not retry =========="
        return
    }
}
That is, Jsoup.connect(url).get() downloads the page's HTML and parses it into a Document object.
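The connection can also be tuned before fetching; setting a user agent and a timeout often helps with servers that reject bare clients. A small sketch; the user-agent string and timeout value are illustrative:

// illustrative settings: a desktop user agent and a 10-second timeout
Document doc = Jsoup.connect(url)
        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
        .timeout(10000)     // milliseconds
        .get()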
Parsing the page content
The Document object holds the whole HTML page, and we need to pick the useful information out of it. Typically you locate the relevant Element objects by CSS class, tag name, and so on, then read their attributes and text. The Groovy code looks like this:
// the file the parsed rows are written to; read back in the storage step
def writer = new FileWriter("dytt.com.txt", true)
// the listing lives in a div with class tableListMain
Elements elements = doc.getElementsByClass("tableListMain")
// the table body inside that div
Element tbody = elements.get(0).getElementsByTag("tbody").get(0)
// one tr per movie
Elements eleList = tbody.children()
for (Element tr : eleList) {
    try {
        Elements tds = tr.getElementsByTag("td")
        // assigning an Element to a String makes Groovy call toString(),
        // which yields the element's HTML; it is parsed again in the storage step
        String a = tds.get(0).getElementsByTag("a").get(0)
        String type = tds.get(1)
        String country = tds.get(2)
        String time = tds.get(4)
        String actors = tds.get(5).getElementsByTag("div").get(0)
        println a
        // one record per line: strip embedded newlines before writing
        writer.write("\n")
        writer.write(a.replace("\n", " "))
        writer.write(type.replace("\n", " "))
        writer.write(country.replace("\n", " "))
        writer.write(time.replace("\n", " "))
        writer.write(actors.replace("\n", " "))
    } catch (Exception ex) {
        // a malformed row should not abort the whole page
        ex.printStackTrace()
    }
}
writer.close()
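An equivalent, often more compact way to do the same lookup is Jsoup's CSS selector API. A sketch of the same traversal, assuming the same tableListMain markup:

// select every table row under the listing div in one query
for (Element tr : doc.select("div.tableListMain tbody tr")) {
    // .text() yields plain text instead of HTML, if the markup is not needed
    println tr.select("td a").first()?.text()
}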
Storing the data
Now store the parsed information. Database access in Groovy is very simple: instantiate a Sql object, and through it data can be saved with very little code. The Groovy code is as follows:
import groovy.sql.Sql

/**
 *
 * @author yawn
 * @date 2021/1/19 Tuesday
 *
 */
// database connection
url = "jdbc:mysql://localhost:3306/test?serverTimezone=UTC&characterEncoding=UTF-8&useUnicode=true"
sql = Sql.newInstance(url, "root", "root", "com.mysql.cj.jdbc.Driver")
// read the file written by the parsing step
FileReader fr = new FileReader("dytt.com.txt")
BufferedReader br = new BufferedReader(fr)
String line
while ((line = br.readLine()) != null) {
    if ("" == line.trim()) {
        continue
    }
    // getName, getHref, etc. are field-extraction helpers
    // whose definitions are omitted from this listing
    def name = getName(line)
    def href = getHref(line)
    def type = getType(line)
    def country = getCountry(line)
    def time = getTime(line)
    def actors = getActors(line)
    if (name == null || name == "") {
        continue
    }
    // parameterized insert; executeInsert returns the generated keys
    def insertSql = 'INSERT INTO movie_dytt (name, href, type, country, time, actors) VALUES (?, ?, ?, ?, ?, ?)'
    def params = [name, href, type, country, time, actors]
    def keys = sql.executeInsert(insertSql, params)
    println keys
    println name
}
br.close()
sql.close()
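The insert above assumes a movie_dytt table already exists. A plausible definition, inferred from the columns used; the column types and lengths are assumptions:

// create the target table if it is missing; lengths are guesses
sql.execute '''
    CREATE TABLE IF NOT EXISTS movie_dytt (
        id      INT AUTO_INCREMENT PRIMARY KEY,
        name    VARCHAR(255),
        href    VARCHAR(512),
        type    VARCHAR(255),
        country VARCHAR(255),
        time    VARCHAR(255),
        actors  TEXT
    )
'''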
If the data instead needs to go into a search server such as Elasticsearch or Meilisearch, code like the following works:
import groovy.sql.GroovyRowResult
import groovy.sql.Sql
import groovyx.net.http.ContentType
import groovyx.net.http.HTTPBuilder

/**
 *
 * @author yawn
 * @date 2021/1/19 Tuesday
 *
 */
// database connection
url = "jdbc:mysql://localhost:3306/test?serverTimezone=UTC&characterEncoding=UTF-8&useUnicode=true"
sql = Sql.newInstance(url, "root", "root", "com.mysql.cj.jdbc.Driver")
// first page; offset the ids and turn relative hrefs into absolute URLs
List<GroovyRowResult> list = sql.rows("select (20000 + id) id, CONCAT('https://www.dytt.com', href) href, name, type, country, time, actors from movie_dytt limit 100")
int offset = 0
while (list != null && !list.isEmpty()) {
    def mapList = convert(list)
    println(mapList)
    def http = new HTTPBuilder('http://xxx.cn:7700')
    // Meilisearch accepts document batches as a JSON array
    http.post(path: '/indexes/movies/documents', body: mapList, requestContentType: ContentType.JSON) { resp ->
        println resp
    }
    // advance by one page; incrementing after the first query (not before)
    // avoids skipping rows 100-199
    offset += 100
    list = sql.rows("select (20000 + id) id, CONCAT('https://www.dytt.com', href) href, name, type, country, time, actors from movie_dytt limit ?, ?", [offset, 100])
    println offset
}
sql.close()

def convert(List<GroovyRowResult> list) {
    // GroovyRowResult already behaves like a Map, so it serializes to JSON as-is
    return list
}
The code above reads rows from the database page by page and writes them into Meilisearch: the Sql object pulls the data out of MySQL, and HTTPBuilder posts each batch to Meilisearch over HTTP.
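Once the documents are indexed, the import can be checked with a search query. A sketch against the same host and index, assuming the Meilisearch instance requires no API key:

// query the movies index for a keyword; host and index match the import above
def http = new HTTPBuilder('http://xxx.cn:7700')
http.post(path: '/indexes/movies/search', body: [q: 'action'], requestContentType: ContentType.JSON) { resp, json ->
    // Meilisearch returns matching documents under the "hits" key
    println json.hits
}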