但行好事,莫问前程

挖了太多坑,一点点填回来

曾经写过的Ruby脚本

crawler, ruby, 人人, 相册, 豆瓣

以前为了爬豆瓣和人人的美女图片,自己动手用Ruby写过一个爬虫。今天再来看看,几乎都看不懂了,晚上趁着休息时间把以前的代码温习温习,加上了注释,不过这代码已经证实在我的Ubuntu上是跑不动了。唉,开源软件版本更新得太快,以前的老代码放到现在都不能用了,真担心有一天自己也被淘汰,非常害怕啊。

代码很长,偶尔看一下,觉得自己还是能踏踏实实做一个苦逼的码农的,只是有时候人在江湖身不由己,你干的事情不一定是你自己喜欢干的。

(album_download.rb) download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/ruby
require "rubygems"
require "nokogiri"
require "open-uri"
require "thread"
require "iconv"
require "rbconfig"
require File.expand_path("../album", __FILE__)

# Reports whether this script runs on a native Windows build of Ruby.
#
# The host OS is read from RbConfig::CONFIG. The legacy top-level `Config`
# alias is gone from current Ruby versions and raises NameError there.
# Both MinGW and MSVC ("mswin") builds are recognised.
#
# @return [Boolean] true on a Windows build, false otherwise
def on_windows?
  !(RbConfig::CONFIG["host_os"] =~ /mswin|mingw/i).nil?
end

# Command-line task: validates the album URLs passed on the command line and
# downloads every supported album.
class AlbumDownload
  # Collects the command-line arguments.
  #
  # A missing argument is reported on stdout instead of being raised. In that
  # case @args stays an empty Array, so #go ends with InvalidUrlError rather
  # than calling a method on nil.
  def initialize
    @args = []
    @verified_urls = []
    begin
      get_cmd_params
    rescue ArgumentError => e
      puts e.class.to_s, e.message, e.backtrace
    end
  end

  # Verifies the URLs and downloads each album that passed verification.
  # If no URL is valid, the error is printed and the process exits.
  def go
    verify_urls
    @verified_urls.each do |url|
      Album.factory(url).download
    end
  rescue InvalidUrlError => e
    puts e.class.to_s, e.message, e.backtrace
    exit
  end

  # Empty placeholder; it does nothing.
  def download

  end

  protected

  # Stores ARGV in @args.
  #
  # @raise [ArgumentError] when no command-line argument was given
  def get_cmd_params
    raise ArgumentError, "Please input an album url!" if ARGV.empty?
    @args = ARGV
  end

  # Keeps the arguments that are valid album URLs in @verified_urls.
  # Several URLs may be passed, although the original author recommends
  # passing a single one per run.
  #
  # @raise [InvalidUrlError] when none of the arguments is a valid album URL
  def verify_urls
    # Rebuilt on every call so that repeated calls do not accumulate duplicates.
    @verified_urls = @args.select { |arg| valid_album?(arg) }
    raise InvalidUrlError, "Please input an valid album url!" if @verified_urls.empty?
  end

  # Truthy when the URL is a Douban album link or a Renren album link.
  def valid_album?(url)
    douban_album?(url) || renren_album?(url)
  end

  # Matches the URL against the Douban album pattern.
  #
  # @return [Integer, nil] match offset, or nil when the URL does not match
  def douban_album?(url)
    DOUBAN_PUBLIC_ALBUM_URL_EXP =~ url
  end

  # Matches the URL against the Renren album pattern.
  #
  # @return [Integer, nil] match offset, or nil when the URL does not match
  def renren_album?(url)
    RENREN_PUBLIC_ALBUM_URL_EXP =~ url
  end

end

# Start the download only when this file is executed directly, not when it
# is loaded with require.
AlbumDownload.new.go if $PROGRAM_NAME == __FILE__
(album.rb) download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Patterns for the supported public album URLs.
#
# Both are anchored to the whole string (\A ... \z). Without anchors any
# argument that merely contains an album URL would pass validation, while the
# album classes read the album id from the end of the URL and append the
# paging query string directly to it.
DOUBAN_PUBLIC_ALBUM_URL_EXP = /\Ahttp:\/\/www\.douban\.com\/photos\/album\/\d+\/\z/
RENREN_PUBLIC_ALBUM_URL_EXP = /\Ahttp:\/\/page\.renren\.com\/\w+\/album\/\d+\z/

# Error type for invalid pictures (not raised anywhere in the visible code).
# It derives from StandardError so that a bare `rescue` handles it; direct
# descendants of Exception are skipped by a bare `rescue`.
class InvalidPicError < StandardError
end

# Raised when a URL is not a supported album URL.
# It derives from StandardError so that a bare `rescue` handles it; direct
# descendants of Exception are skipped by a bare `rescue`.
class InvalidUrlError < StandardError
end

# Raised when the picture count of an album cannot be determined.
# It derives from StandardError so that a bare `rescue` handles it; direct
# descendants of Exception are skipped by a bare `rescue`.
class InvalidAlbumError < StandardError
end

# Base class of a downloadable web album. It holds the state shared by the
# site-specific subclasses and drives the download. Subclasses override the
# protected hooks defined at the bottom of the class.
class Album
  # Fetches and parses the album page and prepares the download bookkeeping.
  #
  # @param url [String] album URL
  def initialize(url)
    @current_url = url
    # URI#open (from open-uri) is used instead of Kernel#open: Kernel#open no
    # longer fetches URLs on Ruby 3.0+ and treats a leading "|" as a command.
    # The block form also closes the handle once the page is parsed.
    @doc = URI.parse(@current_url).open { |io| Nokogiri::HTML(io) }
    @page_threads = []
    @pic_threads = []
    @download_count = 0
    @lock = Mutex.new
  end

  # Downloads the whole album and prints progress and elapsed time.
  # If the picture count cannot be determined, the error is printed and
  # nothing is downloaded.
  def download
    begin
      # Picture count of the album; implemented by subclasses.
      get_album_pic_count
    rescue InvalidAlbumError => e
      puts e.class.to_s, e.message, e.backtrace
      # Without a picture count the subclasses cannot compute their page
      # loop, so stop here instead of failing later on a nil count.
      return
    end
    time_start = Time.now
    puts "You are downloading pics from #{@current_url} ..."
    # The actual download; implemented by subclasses.
    start_downloading
    puts "Downloading pics from #{@current_url} completed ..."
    puts "Costing time #{Time.now - time_start} seconds ..."
  end

  class << self
    # Factory: builds the album object matching the URL. This partly
    # duplicates the URL validation done by the command-line task.
    #
    # @param url [String] album URL
    # @return [Album] a DoubanAlbum or a RenrenAlbum
    # @raise [InvalidUrlError] when the URL matches neither pattern
    def factory(url)
      case url
      when DOUBAN_PUBLIC_ALBUM_URL_EXP then DoubanAlbum.new(url)
      when RENREN_PUBLIC_ALBUM_URL_EXP then RenrenAlbum.new(url)
      else
        raise InvalidUrlError, "The album url #{url} is invalid!"
      end
    end
  end

  protected

  # Creates the album folder unless it already exists.
  #
  # @param folder_name [String] target directory
  def create_album_folder(folder_name)
    unless File.exist?(folder_name)
      puts "Creating folder #{folder_name} ..."
      Dir.mkdir(folder_name)
    end
    puts "Downloaidng pics to #{folder_name}"
  end

  # Downloads a single picture and writes it in binary mode.
  #
  # @param source [String] picture URL
  # @param dest [String] path of the file to write
  def write_file(source, dest)
    stream = URI.parse(source).open { |f| f.read }
    File.open(dest, "wb") { |f| f.write(stream) }
  end

  # Hook: performs the download. Overridden by subclasses.
  def start_downloading

  end

  # Hook: returns the album folder name. Overridden by subclasses.
  def get_folder_name

  end

  # Hook: determines the number of pictures in the album. Overridden by subclasses.
  def get_album_pic_count

  end

end

# 加载子类
require File.expand_path("../douban_album", __FILE__)
require File.expand_path("../renren_album", __FILE__)
(douban_album.rb) download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Album implementation for public Douban albums.
class DoubanAlbum < Album

  # @param url [String] Douban album URL
  def initialize(url)
    super(url)
    # Step of the "?start=" offset between two listing pages.
    @page_item = 18
  end

  protected

  # Creates the album folder, then fetches every listing page in a thread of
  # its own and downloads every picture of a page in a further thread.
  def start_downloading
    create_album_folder(get_folder_name)

    # One iteration per listing page.
    (@sum / @page_item + 1).times do |i|

      @page_threads << Thread.new do
        url = @current_url + "?start=" + (i * @page_item).to_s
        # URI#open (from open-uri) instead of Kernel#open, which no longer
        # fetches URLs on Ruby 3.0+; the block form closes the handle.
        page = URI.parse(url).open { |io| Nokogiri::HTML(io) }
        puts "Analyzing url #{url} ..."

        page.css(".photo_wrap a img").each do |img|

          @pic_threads << Thread.new do
            # The page lists thumbnails; the picture URL is the thumbnail URL
            # with the first "thumb" replaced by "photo".
            source = img.attribute("src").to_s.sub(/thumb/, "photo")
            dest = /[\w_]+\.(jpg|jpeg|png)/i.match(source).to_s
            write_file(source, File.join(@folder_name, dest))

            # Serialise the progress output and the shared counter.
            @lock.synchronize { puts "Created file #{dest} to #{@folder_name} from #{source} ... #{@download_count += 1}/#{@sum}" }
          end
        end
      end
    end

    # Page threads are joined first because they are the ones that fill
    # @pic_threads.
    @page_threads.each(&:join)
    @pic_threads.each(&:join)
  end

  # Builds the folder name "<album id>_<page title>" and stores it in
  # @folder_name.
  #
  # @return [String] the folder name
  def get_folder_name
    @folder_name = /\d+\/$/.match(@current_url).to_s.sub(/\//, "") + "_" + @doc.css('h1').inner_text
    # On Windows the name is converted to GB2312. Iconv.iconv returns an
    # Array of converted strings, so join it; Array#to_s would yield the
    # inspect form on Ruby 1.9+.
    @folder_name = Iconv.iconv("GB2312", "UTF-8", @folder_name).join if on_windows?
    # Return the name explicitly: a trailing modifier-if evaluates to nil
    # whenever the condition is false, i.e. on every non-Windows system.
    @folder_name
  end

  # Reads the picture count from the album page into @sum.
  #
  # @raise [InvalidAlbumError] when the page has no counter element
  def get_album_pic_count
    re = @doc.css('.wr span.pl')
    raise InvalidAlbumError, "This is an invalid ablum!" if re.empty?
    @sum = re.inner_text.to_i
  end

end
(renren_album.rb) download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Album implementation for public Renren page albums.
class RenrenAlbum < Album

  # @param url [String] Renren album URL
  def initialize(url)
    super(url)
    # Divisor used to derive the number of listing pages from the picture count.
    @page_item = 15
  end

  protected

  # Creates the album folder, then fetches every listing page in a thread of
  # its own and downloads every picture of a page in a further thread.
  def start_downloading
    create_album_folder(get_folder_name)
    # Scheme-and-host prefix of the album URL; it is prepended to the href of
    # every photo link. Computed once because it does not change per picture.
    site_root = @current_url.scan(/http:\/\/[\w+\.]+/)[0].to_s

    (@sum / @page_item + 1).times do |i|
      @page_threads << Thread.new do
        url = @current_url + "?curpage=" + i.to_s
        # URI#open (from open-uri) instead of Kernel#open, which no longer
        # fetches URLs on Ruby 3.0+; the block form closes the handle.
        page = URI.parse(url).open { |io| Nokogiri::HTML(io) }
        puts "Analyzing url #{url} ..."

        page.css("td.photoPan a").each do |link|
          @pic_threads << Thread.new do
            # Use a variable local to this thread. Assigning to the page
            # block's `url` would share one variable between all picture
            # threads of the page and let them overwrite each other's target.
            photo_url = site_root + link.attribute("href").to_s
            photo_page = URI.parse(photo_url).open { |io| io.read }
            # The photo page embeds the large picture URL with escaped
            # slashes; drop the backslashes.
            source = photo_page.scan(/"large":"(http:[\\\/\w.]+\.jpg)"/)[0][0].gsub(/\\/, "")
            dest = /[\w_]+\.(jpg|jpeg|png)/i.match(source).to_s
            write_file(source, File.join(@folder_name, dest))
            # Serialise the progress output and the shared counter.
            @lock.synchronize { puts "Created file #{dest} to #{@folder_name} from #{source} ... #{@download_count += 1}/#{@sum}" }
          end
        end

      end
    end

    # Page threads are joined first because they are the ones that fill
    # @pic_threads.
    @page_threads.each(&:join)
    @pic_threads.each(&:join)
  end

  # Builds the folder name "<album id>_<album title>" and stores it in
  # @folder_name.
  #
  # @return [String] the folder name
  def get_folder_name
    @folder_name = /\d+$/.match(@current_url).to_s + "_" + @doc.css("div.pager-top span h3").inner_text.to_s
    # On Windows the name is converted to GB2312. Iconv.iconv returns an
    # Array of converted strings, so join it; Array#to_s would yield the
    # inspect form on Ruby 1.9+.
    @folder_name = Iconv.iconv("GB2312", "UTF-8", @folder_name).join if on_windows?
    # Return the name explicitly: a trailing modifier-if evaluates to nil
    # whenever the condition is false, i.e. on every non-Windows system.
    @folder_name
  end

  # Reads the picture count from the pager text into @sum.
  #
  # @raise [InvalidAlbumError] when the pager text contains no picture count
  def get_album_pic_count
    re = @doc.css("div.pager-top span").inner_text.to_s.scan(/共(\d+)张/)
    # String#scan returns an empty Array, never nil, when nothing matches.
    raise InvalidAlbumError, "This is an invalid ablum!" if re.empty?
    @sum = re[0][0].to_i
  end
end

用起来很简单,直接在命令行下输入:

1
ruby album_download.rb [豆瓣相册链接或人人相册链接]

不过这段代码再也跑不起来了,写这篇博文做个纪念吧。

最后还是想说,我喜欢Python胜过Ruby,不知道自己这辈子还有没有机会当一个Python程序员。

Have a nice day!