require 'win32ole' #win32ole库
require 'dbi' #数据库
require 'rexml/document'#xm库
require 'iconv'#字符集转换
require 'net/http' # Net::HTTP client used to fetch the pages
include REXML
# Converts a UTF-8 string to GB2312 through Iconv.
#
# ucode - the UTF-8 text to convert.
#
# Returns the converted text. If the conversion raises, the input is returned
# unconverted with one space added on each side; the padding is there to keep
# an unconverted value from producing a malformed SQL string.
def u2g(ucode)
  gb_text = Iconv.conv('gb2312', 'utf-8', ucode)
  gb_text.to_s
rescue StandardError
  padding = ' '
  padding + ucode.to_s + padding
end
# Fetches /static/sub/detail/<id>.xml for every id in start_id..end_id,
# flattens each document's child elements into a hash and inserts one row per
# document into tb_files.
site = 'shooter.cn' # target site to scrape
req = Net::HTTP.new(site, 80) # request object reused for every id
start_id = 1 # first id
end_id = 9000 # last id (inclusive)
max_attempts = 5 # bounded retries so a permanently failing id cannot hang the run

# Elements whose value is a plain text node; every other element is read from
# its first CDATA section.
text_fields = %w[id fileid imdbid splito rlsiteid voteid threadid deleted moderid]

# Hash keys bound to the insert statement, in column order.
columns = %w[fileid orgname engname akaname chname twname hkname othname
             intro format language splito producer verifier source delreason]
# The "language" column is written in brackets in the column list.
column_list = columns.map { |name| name == 'language' ? '[language]' : name }.join(',')
placeholders = (['?'] * columns.size).join(',')
insert_sql = "insert into tb_files (#{column_list}) values (#{placeholders})"

# Database handle.
# NOTE(review): the connection string embeds credentials in source; consider
# moving them to configuration.
dbh = DBI.connect("DBI:ADO:Provider=SQLOLEDB.1;Data Source=192.168.0.10;Initial Catalog=shooter;User Id=sa;Password=asdf*123;")
begin
  start_id.upto(end_id) do |idx|
    # --- fetch -------------------------------------------------------------
    attempts = 0
    begin
      attempts += 1
      # Read the body from the response object instead of destructuring the
      # return value of #get, which leaves the body nil on current Rubies.
      body = req.get("/static/sub/detail/#{idx}.xml").body
    rescue StandardError, Timeout::Error
      # Timeout::Error is listed explicitly because on old Rubies it is not a
      # StandardError and a bare rescue would miss it.
      if attempts < max_attempts
        puts "retry #{idx}"
        retry
      end
      puts "skip #{idx}: fetch failed #{attempts} times"
      next
    end

    # --- parse -------------------------------------------------------------
    file_info = {}
    begin
      REXML::Document.new(body).root.each_element do |ele|
        # A non-text element without a CDATA section raises here, which skips
        # the whole document through the rescue below.
        raw = text_fields.include?(ele.name) ? ele.text : ele.cdatas[0].value
        # Single quotes are stripped from every value, as before, so stored
        # values keep the same shape as rows written by earlier runs.
        file_info[ele.name] = u2g(raw).delete("'")
      end
    rescue
      next # document could not be parsed: skip this id
    end

    # --- insert ------------------------------------------------------------
    # Missing keys are bound as empty strings, matching what interpolating nil
    # into the statement used to produce.
    values = columns.map { |name| file_info[name].to_s }
    begin
      # Values are bound rather than interpolated into the SQL text. The
      # affected-row count comes from #do, not from #commit.
      rows = dbh.do(insert_sql, *values)
      dbh.commit
      if rows == 0
        puts "插入失败:#{file_info['fileid']} #{file_info["orgname"]}"
      else
        puts "插入成功:#{file_info['fileid']} #{file_info["orgname"]}"
      end
    rescue => e
      # Print the statement with its bound values, plus the error itself.
      puts "#{insert_sql} #{values.inspect}"
      puts "#{e.class}: #{e.message}"
    end
  end
ensure
  dbh.disconnect # close the connection even if the loop raises
end