# twitter_friends_of_friends.rb
# by Greg Houston
# blog: http://ghouston.blogspot.com
# wiki: http://ghouston.wiki.zoho.com/
#
# this script is posted online at http://pastie.caboo.se/195011
#
# special thanks to Yoan Blanc
# http://yoan.dosimple.ch/blog/2007/05/17/
# which put this idea in my head. this code is based on his script http://yoan.dosimple.ch/blog/2007/05/17/graph2.rb
#
# twitter_friends_of_friends.rb looks at all the followers and follower's followers
# (e.g. 2nd level deep) to produce a Long Tail report.
#
# the report is output in an HTML fragment which I plan to post
# on my wiki.
#
# usage: ./ruby twitter_friends_of_friends.rb nick
# where nick is the starting nick.
# the results are written to twitter_friends_of_friends.html
#
# notes:
# hCard.find( @url ) returns an empty collection if there
# was an error page received from Twitter.
#
# the script uses a cache folder to store the downloaded html. it also
# saves the yaml for each Twitt after parsing the html. If this script is
# run multiple times, it will use the cache to avoid extra work. This makes
# things go much faster while developing this script. You will have to
# clear the cache directory if you want to get the latest from Twitter.
#
# getting the img_url can be slow (especially since this is single threaded),
# so generating the output can take a long time. please be patient. If you
# want to speed things up, look at the lines near the end which follow #NOTE:
# comments. You can change between the Fast option which doesn't get img_urls
# for friends of friends. Or the Slow option which provides a nicer output.
#
require 'rubygems'
require 'fileutils'
require 'pp'
require 'singleton'
require 'yaml'
require 'mofo'
require 'hpricot'
require 'rio'
# Print +msg+ on standard output and flush right away, so progress lines
# show up while the (slow) downloads are still running.
def say(msg)
  $stdout.puts(msg)
  STDOUT.flush
end
# Make sure the cache folder (downloaded html + parsed yaml) exists.
FileUtils.mkdir_p('cache')
# Process-wide registry of every Twitt created so far, keyed by nick.
#
# All lookups go through TwittSpace.get so that a nick maps to exactly one
# Twitt object; the scoring code relies on that to accumulate scores.
class TwittSpace
  include Singleton

  # Number of Twitts currently held in the registry.
  def self.size
    instance.everyone.size
  end

  # Class-level shortcut for TwittSpace.instance.get( nick ).
  def self.get(nick)
    instance.get(nick)
  end

  # Class-level shortcut for TwittSpace.instance.everyone.
  def self.everyone
    instance.everyone
  end

  # Hash of nick => Twitt for everything looked up so far.
  attr_reader :everyone

  def initialize
    @everyone = {}
  end

  # Return the Twitt for +nick+.
  #
  # Lookup order: the in-memory registry, then the yaml cache file
  # cache/<nick>.yaml, and finally a brand new (not yet loaded) Twitt.
  # The result is stored in the registry under the Twitt's own nick.
  def get(nick)
    return @everyone[nick] if @everyone.include?(nick)

    yaml_file_path = "cache/#{nick}.yaml"
    if File.exist?(yaml_file_path) # File.exists? is gone in Ruby 3.2
      say " loading #{yaml_file_path} (#{TwittSpace.size})"
      twitt = load_cached(yaml_file_path)
    else
      twitt = Twitt.new(nick)
    end
    @everyone[twitt.nick] = twitt
    twitt
  end

  private

  # Deserialize a cached Twitt from +yaml_file_path+, closing the file
  # afterwards.
  #
  # NOTE(security): this revives arbitrary Ruby objects from yaml. That is
  # acceptable only because the cache files are written by this script;
  # never point it at yaml from an untrusted source.
  def load_cached(yaml_file_path)
    File.open(yaml_file_path) do |file|
      # Psych 4 made YAML.load "safe", which rejects custom classes such as
      # Twitt; unsafe_load restores the behaviour this cache depends on.
      if YAML.respond_to?(:unsafe_load)
        YAML.unsafe_load(file)
      else
        YAML.load(file)
      end
    end
  end
end
# One Twitter user, identified by nick.
#
# The profile page is fetched (or read from the cache folder) lazily, the
# first time #contacts, #img_url or #following is called. After parsing,
# the object is written to cache/<nick>.yaml so later runs can skip the
# download and the parsing.
class Twitt
  # Instance variables that must never end up in the yaml cache:
  # @following would drag other Twitts in, and @score only makes sense for
  # the current run.
  UNCACHED_IVARS = ['@following', '@score'].freeze

  attr_writer :score
  # NOTE: @loaded is never assigned in this class (the flags in use are
  # @loaded_base and @loaded_following); the reader is kept so existing
  # callers do not break.
  attr_reader :url, :nick, :loaded, :address, :full_name

  def initialize(nick)
    @nick = nick
    @score = 0
    @url = 'http://twitter.com/' + @nick
  end

  # Current score. Defaults to 0 for objects revived from the yaml cache,
  # which do not carry @score and never ran #initialize.
  def score
    @score ||= 0
  end

  # Fetch the profile page from @url, trying up to three times.
  #
  # An attempt counts as failed when reading raises or when the page
  # matches "Status: 500 Internal Server Error". A non-empty page is saved
  # to +cache_file_path+. Returns the page text, or '' when every attempt
  # failed.
  def download(cache_file_path)
    page = ''
    3.times do
      begin
        page = rio(@url).read
      rescue StandardError
        page = ''
        next
      end
      break unless page =~ /Status: 500 Internal Server Error/
      page = ''
    end
    rio(cache_file_path) < page unless page.empty?
    page
  end

  # Names of the instance variables to serialize, sorted, without the ones
  # listed in UNCACHED_IVARS. Comparing on to_s works whether
  # instance_variables yields Strings (Ruby 1.8) or Symbols (1.9+).
  def to_yaml_properties
    instance_variables.reject { |ivar| UNCACHED_IVARS.include?(ivar.to_s) }.sort
  end

  # Psych serialization hook. Psych does not consult to_yaml_properties,
  # so the same filter is applied here; keys are the variable names without
  # the leading "@".
  def encode_with(coder)
    to_yaml_properties.each do |ivar|
      coder[ivar.to_s.sub(/\A@/, '')] = instance_variable_get(ivar)
    end
  end

  # Lazily load the basic attributes (@contacts, @img_url, @full_name,
  # @address) from the cached html, downloading it first when it is not in
  # the cache yet, then write the yaml cache entry. Runs at most once per
  # object.
  def load
    return if @loaded_base
    @loaded_base = true

    cache_file_path = "cache/#{@nick}.html"
    if File.exist?(cache_file_path) # File.exists? is gone in Ruby 3.2
      say " loading #{@nick}.html (#{TwittSpace.size})"
      page = rio(cache_file_path).read
    else
      say " downloading #{@nick} (#{TwittSpace.size})"
      page = download(cache_file_path)
    end

    @contacts = hCard.find( :text => page )
    @contacts = [@contacts] unless @contacts.instance_of?(Array)

    doc = Hpricot(page)
    # The profile image is missing when the page could not be loaded (empty
    # or error page); leave @img_url nil instead of crashing on nil.
    profile_image = (doc / "h2//img[@id='profile-image']")[0]
    @img_url = profile_image ? profile_image.attributes['src'] : nil
    @full_name = (doc / "span[@class='fn']").inner_html
    @address = (doc / "span[@class='adr']").inner_html

    rio("cache/#{@nick}.yaml") < self.to_yaml
  end

  # Lazily build @following (nick => Twitt) from the contacts whose url
  # points at twitter.com or explore.twitter.com.
  #
  # This attribute is not included in the yaml cache; it has to be loaded
  # separately from the basic attributes to avoid getting into recursion.
  # Runs at most once per object and expects #load to have run already.
  def load_following
    return if @loaded_following
    @loaded_following = true

    @following = {}
    @contacts.each do |contact|
      next unless contact.url =~ %r{^http://twitter\.com/} ||
                  contact.url =~ %r{http://explore\.twitter\.com/}
      # the nick is whatever follows the last slash of the url
      nick = contact.url.slice(/[^\/]*$/)
      @following[nick] = TwittSpace.get(nick)
    end
  end

  # Contacts parsed from the profile page (loads the page when needed).
  def contacts
    self.load
    @contacts
  end

  # Profile image url, or nil when the page had none (loads when needed).
  def img_url
    self.load
    @img_url
  end

  # Hash of nick => Twitt for the contacts found on the profile page.
  def following
    self.load
    self.load_following
    @following
  end
end
if __FILE__==$0
say "starting..."
STDOUT.flush
root_nick = (ARGV[0].nil?) ? 'neversleep360' : ARGV[0]
root = TwittSpace.get( root_nick )
#pp root
#say "forced load..."
#root.load
#pp root
#root.load_following
#pp root
#exit
level1 = root.following # load level 1
say "scoring friends..."
level2 = Hash.new
level1.each do | nick, twitt |
say " #{nick}"
twitt.following.each do | nick2, twitt2 |
twitt2.score += 1
level2[nick2]=twitt2 unless level2.include?(nick2) || level1.include?(nick2)
end
end
say "scoring friends of friends..."
level2.each do | nick, twitt |
say " #{nick}"
twitt.following.each do | nick2, twitt2 |
twitt2.score += 1
end
end
say "\n\ncalculating rank..."
ranking = level1.values + level2.values
say " #{ranking.size} nicks"
ranking.sort! do |a,b|
result = -(a.score <=> b.score)
(result == 0) ? a.nick.casecmp( b.nick ) : result
end
say "\n\nreporting results..."
out = rio( "twitter_friends_of_friends.html" )
out < "
"
current_score = nil
column = 0
ranking.each do |buddy|
if current_score != buddy.score
out << "
" unless current_score.nil?
current_score = buddy.score
out << "
#{buddy.score}
"
column = 1
end
if column > 6
out << "
"
column = 1
end
#NOTE: fast... but not all images
#puts "