如下参考:这里
#!/usr/bin/ruby -Ku
# source must be written in UTF-8
$KCODE = 'u'
require 'rubygems'
require 'iconv'
# Patching Mechanize...
require 'mechanize'
# Page class that transcodes the raw response body to UTF-8 with iconv
# before Mechanize's own page class gets to parse it.
class IConvParser < WWW::Mechanize::Page
  # Takes the same four arguments as the original patch and forwards them
  # to the superclass, with the body replaced by its UTF-8 transcoding.
  def initialize(uri = nil, response = nil, body = nil, code = nil)
    # The "//IGNORE" suffix is passed through to iconv on both charsets.
    target_charset = "UTF-8//IGNORE"
    source_charset = "GB2312//IGNORE"
    utf8_body = Iconv.conv(target_charset, source_charset, body)
    super(uri, response, utf8_body, code)
  end
end
# Set IConvParser as the default HTML parser so that you don't have to
# run iconv() on each page manually.
# Reopens Mechanize's PluggableParser so that every newly created instance
# maps the HTML content type to IConvParser.
class WWW::Mechanize::PluggableParser
  # Registers IConvParser for CONTENT_TYPES[:html] and sets the default
  # parser class (presumably used for content types with no registered
  # parser -- confirm against the installed Mechanize version).
  def initialize
    @parsers = { CONTENT_TYPES[:html] => IConvParser }
    # This class is reopened with the compact `A::B::C` form, so constant
    # lookup does not search the WWW::Mechanize namespace lexically; a bare
    # `File` would resolve to Ruby's core ::File. Name Mechanize's own file
    # class explicitly instead.
    @default = WWW::Mechanize::File
  end
end
# Scrubyt::FetchAction initializes its @@agent as PluggableParser.new
# at class context, so we manually require 'scrubyt'
# AFTER the above patch was installed.
require 'scrubyt'
# With the patch in place, the extractor definition itself is short.
# The block below is written in Scrubyt's extractor DSL.
baidu_data = Scrubyt::Extractor.define do
# Load the Baidu home page.
fetch "http://www.baidu.com/"
# Put the query "ruby" into the text field identified by 'wd'
# (presumably the search box's name attribute -- TODO confirm), then submit.
fill_textfield 'wd',"ruby"
submit
# Declare a pattern named `result`. The string is presumably example text
# from the results page that Scrubyt uses to locate matching elements --
# TODO confirm against the Scrubyt version in use.
result "Ruby_百度百科"
end
# Print the extracted data as XML.
puts baidu_data.to_xml
mysql,Linux,HighPerformance,ruby on Rails
2009年4月19日星期日
Scrubyt抓取非utf-8的代码Sample
订阅:
博文评论 (Atom)
没有评论:
发表评论