Report abuse

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
require 'rubygems'

require 'cgi'
require 'date'
require 'net/http'
require 'open-uri'

require 'hpricot'
require 'atom/entry'
require 'atom/collection'

# Import settings -- edit these before running the script.

# Text file with one post URL per line (only used by option B below).
urls_to_import = 'urls.txt'

# Destination WordPress blog and its AtomPub endpoint.
wp_blog_host = 'livinglife.sweetperceptions.com'
wp_blog_uri  = "http://#{wp_blog_host}"
wp_base      = "#{wp_blog_uri}/wp-app.php"

# Credentials used for HTTP basic auth against the endpoint.
wp_blog_username = 'myusername'
wp_blog_password = 'mypassword'

# Source blog to scrape, and which of its index pages to walk.
your_blog_source = 'http://sweetperceptions.i.ph'
which_pages      = (1..19)

# Login credentials keyed by author display name.
# NOTE(review): nothing in the visible part of this script reads `authors`;
# the import loop authenticates with wp_blog_username / wp_blog_password.
authors = {
  "Maricris Nonato" => {
    "user"     => "myusername",
    "password" => "mypassword"
  }
}

# Category names a scraped tag may match exactly (rule 1 of the tag matching).
registered_categories = [
  'About me',
  'Artistry',
  'Cool Finds',
  'Dreams',
  'Events',
  'Health and Beauty',
  'Horoscope',
  'Living Life',
  'Meme',
  'Movies',
  'Music',
  'Notes',
  'Pet Love',
  'Quotes',
  'Random thoughts',
  'Stories to share',
  'Techie',
  'Travel'
]

# Maps each category name to the source-blog tags that should be filed under
# it (rule 2 of the tag matching). The import loop checks scraped tags against
# these lists with include?, so entries must match a tag exactly, case included.
synonym_categories = {
  'About me'          => %w[me],
  'Artistry'          => %w[poem],
  'Cool Finds'        => %w[cool],
  'Dreams'            => %w[dream dreams],
  # Written out in full because "New year" contains a space.
  'Events'            => ['event', 'bday', 'birthday', 'Christmas', 'New year', 'new-year', 'celebration'],
  'Health and Beauty' => %w[health sickness headache fever cancer],
  'Horoscope'         => %w[cookie fortune horoscope astrology psych],
  'Living Life'       => %w[life kalokohan],
  'Meme'              => %w[meme],
  'Movies'            => %w[hollywood movie movies movie-lines happy-feet],
  'Music'             => %w[song songs singer music ost],
  'Notes'             => %w[notes],
  'Pet Love'          => %w[pet cat dog animal animals pets],
  'Quotes'            => %w[quote quotes],
  'Random thoughts'   => %w[thought thoughts think logic],
  'Stories to share'  => %w[story stories adventure],
  'Techie'            => %w[tech techie work web2.0 development software online skype pc],
  'Travel'            => %w[philippines travel province]
}

# Rules of matching to categories:
# 1. exact match
# 2. synonyms/variations -> manual

# Build the list of post URLs to import. Two sources are available:
#   A. (active)        scrape the post links from the source blog's index pages
#   B. (commented out) read them from the text file named by urls_to_import

# A. Walk every index page in which_pages and gather the href of each
#    entry-title link.
urls = []

which_pages.each do |page|
  index_page = Hpricot(open("#{your_blog_source}/page/#{page}/"))
  urls << (index_page / "h3[@class='entrytitle']/a").map { |link| link['href'] }
end

# B. Uncomment to use the URLs from the text file instead; this replaces
#    whatever option A collected.
# urls = File.readlines(urls_to_import).map { |line| line.chomp }

# Option A appends one array of links per page, so collapse to a single flat
# list and drop any nil entries.
urls = urls.flatten.compact

# Import every collected post: parse the page with Hpricot, pull out its
# title, date, excerpt, tags and body, wrap them in an Atom entry, and POST
# that entry to the collection at wp_base + "/posts".
urls.each do |target|
  doc = Hpricot(open(target))

  # Title: inner HTML of the entry-title link, HTML-unescaped, CRLF pairs removed.
  title = CGI.unescapeHTML((doc / "div/h3[@class='entrytitle']/a").inner_html.strip).gsub(/\r\n/, '')

  # Display name for the Atom author element. It needs its own variable:
  # sharing `author` with the Atom::Author object below would make
  # `author.name = author` assign the object to itself and lose this string.
  author_name = "Chris"

  # The date text and the time of day come from different elements and are
  # joined before parsing. The time is treated as optional so that a page with
  # no H:MM:SS match is parsed from the date alone instead of raising on nil.
  timestr = (doc / "div[@class='meta-post']").inner_html[/\d+:\d\d:\d\d/]
  datestr = (doc / "div/span[@class='date']").inner_html.strip.gsub(/\r\n/, '')
  datestr = [datestr, timestr].compact.join(' ')
  # NOTE: the offset in the formatted timestamp is the literal "-0500"; it is
  # not derived from anything on the page.
  datestr = DateTime.parse(datestr).strftime('%a, %-d %b %Y %T -0500')

  excerpt = (doc / "div[@class='entry_summary']").inner_html.gsub(/\r\n/, '')

  # Map the post's tags onto the destination blog's categories.
  tags = (doc / "div[@class='tag-list']/a").map { |link| link.inner_html }

  # Rule 1: tags that are themselves registered category names.
  exact_matches = tags.select { |tag| registered_categories.include?(tag) }

  # Rule 2: categories that list at least one of the post's tags as a synonym.
  synonym_matches = synonym_categories.keys.select do |category|
    synonym_categories[category].any? { |synonym| tags.include?(synonym) }
  end

  # Exact matches first, then synonym matches in hash order, without repeats.
  categories = (exact_matches + synonym_matches).uniq.join(',')

  # The body container's id is "postentry-<n>", where <n> is the last
  # dash-separated piece of the id on the div with class "blog".
  entry_id = "postentry-#{doc.at("div[@class='blog']")['id'].split('-').last}"
  entry_selector = "##{entry_id}"

  # Strip the parts of the container that were already extracted above so that
  # only the post body remains. This must happen after title/date/tags are read.
  [
    "h3",
    "span[@class='date']",
    "div[@class='tag-list']",
    "div[@class='meta-post']"
  ].each do |unwanted|
    (doc / "#{entry_selector}/#{unwanted}").remove
  end

  # The ad-section marker comments are not wrapped in any element, so they are
  # removed from the serialized HTML by exact string match.
  leading_ad_markers = "\n        <!-- google_ad_section_start -->\n    \n    <!-- google_ad_section_end -->\n    \n        <!-- google_ad_section_start -->\n"
  trailing_ad_marker = "\n        <!-- google_ad_section_end -->\n    \n"
  content = (doc / entry_selector).inner_html.gsub(leading_ad_markers, '').gsub(trailing_ad_marker, '')

  # Atom Author element
  author = Atom::Author.new
  author.name = author_name
  author.uri = wp_blog_uri

  # Atom Entry element
  entry = Atom::Entry.new
  entry.title = title
  entry.summary = excerpt
  entry.content = content
  entry.content.type = "html"
  entry.published = datestr
  entry.updated = datestr
  entry.tag_with(categories, ',')
  entry.authors << author

  # HTTP client that always sends basic-auth credentials.
  req = Atom::HTTP.new
  req.user = wp_blog_username
  req.pass = wp_blog_password
  req.always_auth = :basic

  # Atom Collection
  c = Atom::Collection.new(wp_base + "/posts", req)

  res = c.post! entry

  puts "Imported URL: #{target}, at #{datestr}, #{res.message}\n"
end