Index: test/test_parser.rb
===================================================================
--- test/test_parser.rb	(revision 159)
+++ test/test_parser.rb	(working copy)
@@ -67,8 +67,8 @@
     assert_equal 'link1', (doc/:p/:a).first['id']
     assert_equal 'link1', doc.search('p').at('a').get_attribute('id')
     assert_equal 'link2', (doc/'p').filter('.ohmy').search('a').first.get_attribute('id')
-    assert_equal (doc/'p')[2], (doc/'p').filter(':nth(2)')[0]
-    assert_equal (doc/'p')[2], (doc/'p').filter('[3]')[0]
+    assert_equal((doc/'p')[2], (doc/'p').filter(':nth(2)')[0])
+    assert_equal((doc/'p')[2], (doc/'p').filter('[3]')[0])
     assert_equal 4, (doc/'p').filter('*').length
     assert_equal 4, (doc/'p').filter('* *').length
     eles = (doc/'p').filter('.ohmy')
@@ -373,7 +373,7 @@
     
     END
     doc = Hpricot::XML(chunk)
-    assert (doc/"//t:sam").size > 0 # at least this should probably work
+    assert((doc/"//t:sam").size > 0) # at least this should probably work
     # assert (doc/"//sam").size > 0  # this would be nice 
   end
 end
Index: ext/hpricot_scan/hpricot_scan.rl
===================================================================
--- ext/hpricot_scan/hpricot_scan.rl	(revision 159)
+++ ext/hpricot_scan/hpricot_scan.rl	(working copy)
@@ -8,19 +8,37 @@
  */
 #include 
 
+/* Accessor-macro shims for Ruby headers older than 1.9.0 */
 #ifndef RARRAY_LEN
-#define RARRAY_LEN(arr)  RARRAY(arr)->len
-#define RSTRING_LEN(str) RSTRING(str)->len
-#define RSTRING_PTR(str) RSTRING(str)->ptr
+	/*
+	 * Pre-1.9 struct RArray / struct RString expose `len` and `ptr` as plain
+	 * members. The `as.heap` / `as.ary` union and the embed flags belong to
+	 * the 1.9 layout, whose ruby.h already supplies every macro defined here.
+	 */
+	#define RARRAY_LEN(ptr) (RARRAY(ptr)->len)
+	#define RARRAY_PTR(ptr) (RARRAY(ptr)->ptr)
+	/* Later 1.8.x headers may already provide RSTRING_*; do not redefine. */
+	#ifndef RSTRING_LEN
+	#define RSTRING_LEN(str) (RSTRING(str)->len)
+	#endif
+	#ifndef RSTRING_PTR
+	#define RSTRING_PTR(str) (RSTRING(str)->ptr)
+	#endif
+	#ifndef RSTRING_END
+	#define RSTRING_END(str) (RSTRING_PTR(str)+RSTRING_LEN(str))
+	#endif
 #endif
 
 #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net.  So sorry!"
 
 static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
       sym_cdata, sym_text;
+
 static VALUE rb_eHpricotParseError;
+
 static ID s_read, s_to_str;
 
+/* Collect raw content as ruby string and yield it to block passed to Hpricot.scan */
 #define ELE(N) \
   if (tokend > tokstart || text == 1) { \
     VALUE raw_string = Qnil; \
@@ -38,9 +56,9 @@
     N = rb_str_new(mark_##N, E - mark_##N);
 
 #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
-
 #define SLIDE(N) if ( mark_##N > tokstart ) mark_##N = buf + (mark_##N - tokstart);
 
+/* Creates Hash of attributes from key/value pair */
 #define ATTR(K, V) \
     if (!NIL_P(K)) { \
       if (NIL_P(attr)) attr = rb_hash_new(); \
@@ -116,6 +134,36 @@
 
 #define BUFSIZE 16384
 
+/*
+ * Creates a 4-element array for a token and yields it to the block/proc given to the Hpricot.scan method in Ruby.
+ * _why explains it on his old blog, Redhanded:
+ *
+ * (1) a symbol describing the element type, 
+ * (2) the tag name or text content, 
+ * (3) an attributes hash, 
+ * (4) the raw string which formed this token.
+ * 
+ * See http://redhanded.hobix.com/inspect/okayGiveHpricot02AGo.html for more details.
+ *
+ * Element types:
+ *
+ *  1. stag       — starting tag like 
+ * 2. etag — ending tag like
+ * 3. text — text node like Hpricot is a loosy HTML parser written in Ruby and C + * 4. emtpytag — empty tag note like
+ * 5. comment — comment node like + * 6. xmldecl + * + * Examples (using some tokens from test/files/boingboing.html in Hpricot fixture files) + * + * [:stag, "a", {"href"=>"http://www.pageflakes.com/subscribe.aspx?url=http://feeds.feedburner.com/boingboing/iBag", "title"=>"Boing Boing", "type"=>"application/rss+xml"}, ""] + * [:emptytag, "img", {"src"=>"http://www.boingboing.net/images/pageflakes.gif", "height"=>"17", "alt"=>"Subscribe in Pageflakes", "style"=>"border:0", "width"=>"81"}, "\"Subscribe"] + * [:etag, "a", nil, ""] + * [:text, " on how the two of them found an entirely new song of Scott Joplin's. (its the fourth of four mini-segments.)\n\n\n", nil, " on how the two of them found an entirely new song of Scott Joplin's. (its the fourth of four mini-segments.)\n\n\n"] + * [:comment, " Begin: AdBrite ", nil, nil] + * [:cdata, " predefined content comes here ", nil, nil] + * + */ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint) { VALUE ary; @@ -136,11 +184,9 @@ { int cs, act, have = 0, nread = 0, curline = 1, text = 0; char *tokstart = 0, *tokend = 0, *buf = NULL; - VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil; char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0; int done = 0, ele_open = 0, buffer_size = 0; - int taint = OBJ_TAINTED( port ); if ( !rb_respond_to( port, s_read ) ) { @@ -154,7 +200,6 @@ rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." ); } } - buffer_size = BUFSIZE; if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) { bufsize = rb_ivar_get(self, rb_intern("@buffer_size")); @@ -163,21 +208,18 @@ } } buf = ALLOC_N(char, buffer_size); + %% write init; - %% write init; - while ( !done ) { VALUE str; char *p = buf + have, *pe; int len, space = buffer_size - have; - if ( space == 0 ) { /* We've used up the entire buffer storing an already-parsed token * prefix that must be preserved. Likely caused by super-long attributes. * See ticket #13. 
*/ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING_PTR(tag), curline); } - if ( rb_respond_to( port, s_read ) ) { str = rb_funcall( port, s_read, 1, INT2FIX(space) ); @@ -186,21 +228,18 @@ { str = rb_str_substr( port, nread, space ); } - StringValue(str); memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) ); len = RSTRING_LEN(str); nread += len; - /* If this is the last buffer, tack on an EOF. */ if ( len < space ) { p[len++] = 0; done = 1; } - pe = p + len; %% write exec; - + if ( cs == hpricot_scan_error ) { free(buf); if ( !NIL_P(tag) ) @@ -212,17 +251,17 @@ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline); } } - + if ( done && ele_open ) { ele_open = 0; + if (tokstart > 0) { mark_tag = tokstart; tokstart = 0; text = 1; } } - if ( tokstart == 0 ) { have = 0; @@ -257,23 +296,37 @@ } free(buf); } - +/* Initializes C extension */ void Init_hpricot_scan() { + /* Define a ruby module named Hpricot */ VALUE mHpricot = rb_define_module("Hpricot"); + /* Define a singleton method on it */ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1); + /* Associate that singleton method with scan function in C */ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1); + /* Register Ruby exception class under Hpricot module with Exception it's parent */ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException); - + /* Get internal VM id of read string */ s_read = rb_intern("read"); + /* Get internal VM id of to_str string */ s_to_str = rb_intern("to_str"); + /* Get :xmldecl symbol via VM id of string */ sym_xmldecl = ID2SYM(rb_intern("xmldecl")); + /* Get :doctype symbol via VM id of string */ sym_doctype = ID2SYM(rb_intern("doctype")); + /* Get :procins symbol via VM id of string */ sym_procins = ID2SYM(rb_intern("procins")); + /* Get :stag symbol via VM id of string */ sym_stag = ID2SYM(rb_intern("stag")); + /* Get :etag symbol via VM id of 
string */ sym_etag = ID2SYM(rb_intern("etag")); + /* Get :emptytag symbol via VM id of string */ sym_emptytag = ID2SYM(rb_intern("emptytag")); + /* Get :comment symbol via VM id of string */ sym_comment = ID2SYM(rb_intern("comment")); + /* Get :cdata symbol via VM id of string */ sym_cdata = ID2SYM(rb_intern("cdata")); + /* Get :text symbol via VM id of string */ sym_text = ID2SYM(rb_intern("text")); } Index: lib/hpricot/tags.rb =================================================================== --- lib/hpricot/tags.rb (revision 159) +++ lib/hpricot/tags.rb (working copy) @@ -14,10 +14,11 @@ Attrs = AttrCore + AttrI18n + AttrEvents # All the tags and attributes from XHTML 1.0 Strict - class XHTMLStrict + class XHTMLStrict class << self attr_accessor :tags, :tagset, :forms, :self_closing, :doctype end + @doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"] @tagset = { :html => AttrI18n + [:id, :xmlns], Index: lib/hpricot/parse.rb =================================================================== --- lib/hpricot/parse.rb (revision 159) +++ lib/hpricot/parse.rb (working copy) @@ -1,5 +1,6 @@ require 'hpricot/htmlinfo' +# Shortcut for Hpricot.parse def Hpricot(input = nil, opts = {}, &blk) Hpricot.parse(input, opts, &blk) end Index: lib/hpricot/tag.rb =================================================================== --- lib/hpricot/tag.rb (revision 159) +++ lib/hpricot/tag.rb (working copy) @@ -21,9 +21,11 @@ class BaseEle attr_accessor :raw_string, :parent + def html_quote(str) "\"" + str.gsub('"', '\\"') + "\"" end + def if_output(opts) if opts[:preserve] and not @raw_string.nil? @raw_string @@ -31,10 +33,15 @@ yield opts end end - def pathname; self.name end + + def pathname; + self.name + end + def altered! 
@raw_string = nil end + def self.alterable(*fields) attr_accessor(*fields) fields.each do |f| @@ -48,14 +55,20 @@ class Elem attr_accessor :stag, :etag, :children + def initialize(stag, children=nil, etag=nil) @stag, @etag = stag, etag @children = children ? children.each { |c| c.parent = self } : [] end - def empty?; @children.empty? end + + def empty?; + @children.empty? + end + [:name, :raw_attributes, :parent, :altered!].each do |m| [m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } } end + def attributes if raw_attributes raw_attributes.inject({}) do |hsh, (k, v)| @@ -64,6 +77,7 @@ end end end + def to_plain_text if self.name == 'br' "\n" @@ -77,7 +91,11 @@ super end end - def pathname; self.name end + + def pathname; + self.name + end + def output(out, opts = {}) if empty? and ElementContent[@stag.name] == :EMPTY @stag.output(out, opts.merge(:style => :empty)) @@ -93,13 +111,17 @@ out end end - + + # Start tag class STag < BaseEle + def initialize(name, attributes=nil) @name = name.to_s @raw_attributes = attributes || {} end + alterable :name, :raw_attributes + def attributes_as_html if @raw_attributes @raw_attributes.map do |aname, aval| @@ -108,6 +130,7 @@ end.join end end + def output(out, opts = {}) out << if_output(opts) do @@ -117,12 +140,16 @@ end end end - + + # End tag class ETag < BaseEle + def initialize(qualified_name) @name = qualified_name.to_s end + alterable :name + def output(out, opts = {}) out << if_output(opts) do @@ -134,16 +161,23 @@ class BogusETag < ETag def output(out, opts = {}); out << if_output(opts) { '' }; end end - + + # Text node class Text < BaseEle def initialize(text) @content = text end + alterable :content - def pathname; "text()" end + + def pathname + "text()" + end + def to_s Hpricot.uxs(@content) end + alias_method :inner_text, :to_s alias_method :to_plain_text, :to_s def output(out, opts = {}) @@ -153,10 +187,12 @@ end end end - + + # CDATA 
section class CData < Text alias_method :to_s, :content alias_method :to_plain_text, :content + def output(out, opts = {}) out << if_output(opts) do @@ -164,13 +200,20 @@ end end end - + + # XML declaration class XMLDecl < BaseEle + def initialize(version, encoding, standalone) @version, @encoding, @standalone = version, encoding, standalone end + alterable :version, :encoding, :standalone - def pathname; "xmldecl()" end + + def pathname + "xmldecl()" + end + def output(out, opts = {}) out << if_output(opts) do @@ -181,13 +224,19 @@ end end end - + + # DOCTYPE declaration class DocType < BaseEle def initialize(target, pubid, sysid) @target, @public_id, @system_id = target, pubid, sysid end + alterable :target, :public_id, :system_id - def pathname; "doctype()" end + + def pathname + "doctype()" + end + def output(out, opts = {}) out << if_output(opts) do @@ -199,11 +248,17 @@ end class ProcIns < BaseEle + def initialize(target, content) @target, @content = target, content end - def pathname; "procins()" end + + def pathname + "procins()" + end + alterable :target, :content + def output(out, opts = {}) out << if_output(opts) do @@ -218,8 +273,13 @@ def initialize(content) @content = content end - def pathname; "comment()" end + + def pathname + "comment()" + end + alterable :content + def output(out, opts = {}) out << if_output(opts) do Index: lib/hpricot/elements.rb =================================================================== --- lib/hpricot/elements.rb (revision 159) +++ lib/hpricot/elements.rb (working copy) @@ -1,54 +1,54 @@ module Hpricot -# Once you've matched a list of elements, you will often need to handle them as -# a group. Or you may want to perform the same action on each of them. -# Hpricot::Elements is an extension of Ruby's array class, with some methods -# added for altering elements contained in the array. 
-# -# If you need to create an element array from regular elements: -# -# Hpricot::Elements[ele1, ele2, ele3] -# -# Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem, -# Hpricot::Doc, etc.) -# -# == Continuing Searches -# -# Usually the Hpricot::Elements you're working on comes from a search you've -# done. Well, you can continue searching the list by using the same at -# and search methods you can use on plain elements. -# -# elements = doc.search("/div/p") -# elements = elements.search("/a[@href='http://hoodwink.d/']") -# elements = elements.at("img") -# -# == Altering Elements -# -# When you're altering elements in the list, your changes will be reflected in -# the document you started searching from. -# -# doc = Hpricot("That's my spoon, Tyler.") -# doc.at("b").swap("fork") -# doc.to_html -# #=> "That's my fork, Tyler." -# -# == Getting More Detailed -# -# If you can't find a method here that does what you need, you may need to -# loop through the elements and find a method in Hpricot::Container::Trav -# which can do what you need. -# -# For example, you may want to search for all the H3 header tags in a document -# and grab all the tags underneath the header, but not inside the header. -# A good method for this is next_sibling: -# -# doc.search("h3").each do |h3| -# while ele = h3.next_sibling -# ary << ele # stuff away all the elements under the h3 -# end -# end -# -# Most of the useful element methods are in the mixins Hpricot::Traverse -# and Hpricot::Container::Trav. + # Once you've matched a list of elements, you will often need to handle them as + # a group. Or you may want to perform the same action on each of them. + # Hpricot::Elements is an extension of Ruby's array class, with some methods + # added for altering elements contained in the array. 
+ # + # If you need to create an element array from regular elements: + # + # Hpricot::Elements[ele1, ele2, ele3] + # + # Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem, + # Hpricot::Doc, etc.) + # + # == Continuing Searches + # + # Usually the Hpricot::Elements you're working on comes from a search you've + # done. Well, you can continue searching the list by using the same at + # and search methods you can use on plain elements. + # + # elements = doc.search("/div/p") + # elements = elements.search("/a[@href='http://hoodwink.d/']") + # elements = elements.at("img") + # + # == Altering Elements + # + # When you're altering elements in the list, your changes will be reflected in + # the document you started searching from. + # + # doc = Hpricot("That's my spoon, Tyler.") + # doc.at("b").swap("fork") + # doc.to_html + # #=> "That's my fork, Tyler." + # + # == Getting More Detailed + # + # If you can't find a method here that does what you need, you may need to + # loop through the elements and find a method in Hpricot::Container::Trav + # which can do what you need. + # + # For example, you may want to search for all the H3 header tags in a document + # and grab all the tags underneath the header, but not inside the header. + # A good method for this is next_sibling: + # + # doc.search("h3").each do |h3| + # while ele = h3.next_sibling + # ary << ele # stuff away all the elements under the h3 + # end + # end + # + # Most of the useful element methods are in the mixins Hpricot::Traverse + # and Hpricot::Container::Trav. class Elements < Array # Searches this list for any elements (or children of these elements) matching @@ -198,7 +198,7 @@ # # This example adds a #top anchor to each link. # - def attr key, value = nil, &blk + def attr(key, value = nil, &blk) if value or blk each do |el| el.set_attribute(key, value || blk[el]) @@ -219,7 +219,7 @@ # (doc/"p").add_class("bacon") # # Now all paragraphs will have class="bacon". 
- def add_class class_name + def add_class(class_name) each do |el| next unless el.respond_to? :get_attribute classes = el.get_attribute('class').to_s.split(" ") @@ -232,7 +232,7 @@ # # (doc/"input").remove_attr("disabled") # - def remove_attr name + def remove_attr(name) each do |el| next unless el.respond_to? :remove_attribute el.remove_attribute(name) @@ -240,15 +240,15 @@ self end - # Removes a class from all matched elements. + # Removes a CSS class from all matched elements. # # (doc/"span").remove_class("lightgrey") # - # Or, to remove all classes: + # Or, to remove all CSS classes: # # (doc/"span").remove_class # - def remove_class name = nil + def remove_class(name = nil) each do |el| next unless el.respond_to? :get_attribute if name @@ -260,55 +260,114 @@ end self end - + alias_method :remove_css_class, :remove_class + + # Regexp to parse attribute selectors like li[@class='search_item'] ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^\]'"]*)'?"? *\]!i + # [ ] BRACK_RE = %r!(\[) *([^\]]*) *\]+!i + # Function like :not(...) or :first(...) or :last(...) FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)! + # Custom function CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()! CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)! def self.filter(nodes, expr, truth = true) - until expr.empty? - _, *m = *expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/) - break unless _ - - expr = $' - m.compact! - if m[0] == '@' - m[0] = "@#{m.slice!(2,1)}" + until expr.empty? + # Take groups out of match + # + # Continuing with example of li[@class='search_item'] m will contain the following: + # + # ["@", "class", nil, "=", "search_item"] + # + match, *match_groups = *expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/) + + # Break if no match happened at all + break unless match + + # Post match + expr = $' + # Remove nils + match_groups.compact! 
+ + origin = match_groups.dup + + # If dealing with attribute selector + if match_groups[0] == '@' + # In example above ["@", "class", "=", "search_item"] becomes ["@=", "class", "search_item"] + # Array#to_s behaviour changed in Ruby 1.9.0 + match_groups[0] = "@#{[match_groups.slice!(2, 1)].flatten.first}" + end + + step_one = match_groups.dup + + # Argument looks like this: ':nth(2)' + # Or like this: '[3]' + if match_groups[0] == '[' && match_groups[1] =~ /^\d+$/ + match_groups = [":", "nth", match_groups[1].to_i-1] + end + + # :not function + # argument looks like this: 'p:not(.ohmy)' + if match_groups[0] == ":" && match_groups[1] == "not" + nodes, = Elements.filter(nodes, match_groups[2], false) + # :even or :odd functions + # argument looks like this: 'p:even' or 'p:odd' + elsif "#{match_groups[0]}#{match_groups[1]}" =~ /^(:even|:odd)$/ + nodes = select_even_or_odd_nodes(nodes, match_groups[1]) + + # :first or :last functions + # argument looks like this: 'p:last' or 'p:first' + # just take first or last element from nodes array + elsif "#{match_groups[0]}#{match_groups[1]}" =~ /^(:first|:last)$/ + nodes = [nodes.send(match_groups[1])] + else + meth = "filter[#{match_groups[0]}#{match_groups[1]}]" unless match_groups[0].empty? + #puts "origin: #{origin.inspect} => #{step_one} => #{match_groups[0]} and #{meth}" if ENV['TRACE'] + + # If traversing method we are going to use is known + if meth and Traverse.method_defined?(meth) + args = match_groups[2..-1] + else + # filter[@=] or filter[text()~=] or things like that + meth = "filter[#{match_groups[0]}]" + # If traversing method we are going to use is known + if Traverse.method_defined? 
meth + args = match_groups[1..-1] + # puts "Traverse method defined, args: #{args}" end - - if m[0] == '[' && m[1] =~ /^\d+$/ - m = [":", "nth", m[1].to_i-1] - end - - if m[0] == ":" && m[1] == "not" - nodes, = Elements.filter(nodes, m[2], false) - elsif "#{m[0]}#{m[1]}" =~ /^(:even|:odd)$/ - new_nodes = [] - nodes.each_with_index {|n,i| new_nodes.push(n) if (i % 2 == (m[1] == "even" ? 0 : 1)) } - nodes = new_nodes - elsif "#{m[0]}#{m[1]}" =~ /^(:first|:last)$/ - nodes = [nodes.send(m[1])] - else - meth = "filter[#{m[0]}#{m[1]}]" unless m[0].empty? - if meth and Traverse.method_defined? meth - args = m[2..-1] - else - meth = "filter[#{m[0]}]" - if Traverse.method_defined? meth - args = m[1..-1] - end - end - i = -1 - nodes = Elements[*nodes.find_all do |x| - i += 1 - x.send(meth, *([*args] + [i])) ? truth : !truth - end] - end + end + + i = -1 + nodes = Elements[*nodes.find_all do |node| + i += 1 + # like p, 0 + # or alt, 200606131240, 68 + filtering_args = ([*args] + [i]) + + begin + res = node.send(meth, *filtering_args) + rescue Exception => e + puts "DEBUG => Node: #{node}" + puts "DEBUG => Groups:#{match_groups.join(" ")}" + puts("DEBUG => Arguments to send: " + filtering_args.join(" ")) + puts "DEBUG => Method: #{meth}" + puts "DEBUG => Result of send: #{res}\n\n" + end + + res ? truth : !truth + end] end - [nodes, expr] + end + [nodes, expr] end + + def self.select_even_or_odd_nodes(nodes, oddity = :odd) + result = [] + nodes.each_with_index { |node, i| result.push(node) if (i % 2 == (oddity.to_sym == :even ? 0 : 1)) } + result + end + # Given two elements, attempt to gather an Elements array of everything between # (and including) those two elements. @@ -345,164 +404,242 @@ end def filter(expr) - nodes, = Elements.filter(self, expr) - nodes + nodes, = Elements.filter(self, expr) + nodes end def not(expr) - if expr.is_a? Traverse - nodes = self - [expr] - else - nodes, = Elements.filter(self, expr, false) - end - nodes + if expr.is_a? 
Traverse + nodes = self - [expr] + else + nodes, = Elements.filter(self, expr, false) + end + nodes end private def copy_node(node, l) - l.instance_variables.each do |iv| - node.instance_variable_set(iv, l.instance_variable_get(iv)) - end + l.instance_variables.each do |iv| + node.instance_variable_set(iv, l.instance_variable_get(iv)) + end end end module Traverse - def self.filter(tok, &blk) - define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk) + # Defines methods doing actual has_elements_filtering. Hpricot.has_elements_filter uses it to has_elements_filter out elements. + def self.has_elements_filter(token, &blk) + # Possible methods to be defined: + # + # filter[] + # filter[#] + # filter[.] + # filter[:lt] + # filter[:gt] + # filter[:nth] + # filter[:eq] + # filter[:nth-of-type] + # filter[:first] + # filter[:first-of-type] + # filter[:last] + # filter[:last-of-type] + # filter[:even] + # filter[:odd] + # filter[:first-child] + # filter[:nth-child] + # filter[:last-child] + # filter[:nth-last-of-type] + # filter[:nth-last-child] + # filter[:only-of-type] + # filter[:only-child] + # filter[:parent] + # filter[:root] + # filter[:empty] + # filter[text] + # filter[comment] + # filter[:contains] + # + # The following filters operate on attributes: + # + # filter[@=] + # filter[@!=] + # filter[@~=] + # filter[@|=] + # filter[@^=] + # filter[@$=] + # filter[@*=] + # + # These operate on text: + # + # filter[text()=] + # filter[text()!=] + # filter[text()~=] + # filter[text()|=] + # filter[text()^=] + # filter[text()$=] + # filter[text()*=] + define_method("filter[#{stringify_filter_token(token)}]", &blk) end - - filter '' do |name,i| + + def self.stringify_filter_token(token) + token.is_a?(String) ? token : token.inspect + end + + # get elements by tag name (case insensitive) + has_elements_filter '' do |name, node_position| name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase) end - - filter '#' do |id,i| - self.elem? 
and get_attribute('id').to_s == id + + # get element by element id + has_elements_filter '#' do |id, i| + self.elem? && (get_attribute('id').to_s == id) end - - filter '.' do |name,i| - self.elem? and classes.include? name + + # get elements with given class + has_elements_filter '.' do |css_class, node_position| + self.elem? && classes.include?(css_class) end - - filter :lt do |num,i| - self.position < num.to_i + + # get elements by position (elements with position greater than i are left out) + has_elements_filter :lt do |node_position, _| + self.position < node_position.to_i end - filter :gt do |num,i| - self.position > num.to_i + has_elements_filter :gt do |node_position, _| + self.position > node_position.to_i end - nth = proc { |num,i| self.position == num.to_i } - nth_first = proc { |*a| self.position == 0 } - nth_last = proc { |*a| self == parent.children_of_type(self.name).last } - - filter :nth, &nth - filter :eq, &nth - filter ":nth-of-type", &nth - - filter :first, &nth_first - filter ":first-of-type", &nth_first - - filter :last, &nth_last - filter ":last-of-type", &nth_last - - filter :even do |num,i| - self.position % 2 == 0 - end - - filter :odd do |num,i| - self.position % 2 == 1 - end - - filter ':first-child' do |i| + nth = proc { |node_position, _| self.position == node_position.to_i } + nth_first = proc { |*arguments_we_dont_care_about_here| self.position == 0 } + nth_last = proc { |*arguments_we_dont_care_about_here| self == parent.children_of_type(self.name).last } + evens = proc { |*arguments_we_dont_care_about_here| self.position % 2 == 0 } + odds = proc { |*arguments_we_dont_care_about_here| self.position % 2 == 1 } + + # get nth element in collection + has_elements_filter :nth, &nth + has_elements_filter :eq, &nth + has_elements_filter ":nth-of-type", &nth + + # get first element in collection + has_elements_filter :first, &nth_first + has_elements_filter ":first-of-type", &nth_first + + # get last element in collection + 
has_elements_filter :last, &nth_last + has_elements_filter ":last-of-type", &nth_last + + # get only even elements + has_elements_filter :even, &evens + # get only odd elements + has_elements_filter :odd, &odds + + # get only elements that are first children of respective parents + has_elements_filter ':first-child' do self == parent.containers.first end - - filter ':nth-child' do |arg,i| - case arg - when 'even'; (parent.containers.index(self) + 1) % 2 == 0 - when 'odd'; (parent.containers.index(self) + 1) % 2 == 1 - else self == (parent.containers[arg.to_i + 1]) - end + + # get only elements that are nth children of respective parents + has_elements_filter ':nth-child' do |node_position, i| + self == (parent.containers[node_position.to_i + 1]) end - - filter ":last-child" do |i| + + # get only elements that are last children of respective parents + has_elements_filter ":last-child" do self == parent.containers.last end - filter ":nth-last-child" do |arg,i| - self == parent.containers[-1-arg.to_i] + # get only elements that are nth last (like nth from the opposite end) children of respective parents + has_elements_filter ":nth-last-child" do |node_position, _| + self == parent.containers[-1 - node_position.to_i] end - - filter ":nth-last-of-type" do |arg,i| - self == parent.children_of_type(self.name)[-1-arg.to_i] + + # get only elements that are nth last (like nth from the opposite end) children of given type of respective parents + has_elements_filter ":nth-last-of-type" do |node_position, _| + self == parent.children_of_type(self.name)[-1 - node_position.to_i] end - - filter ":only-of-type" do |arg,i| + + # get only elements that are of given type + has_elements_filter ":only-of-type" do parent.children_of_type(self.name).length == 1 end - - filter ":only-child" do |arg,i| + + # get only child elements + has_elements_filter ":only-child" do |*ignore| parent.containers.length == 1 end - - filter :parent do + + has_elements_filter ":only-children" do |*ignore| + 
parent.containers.length == 1 + end + + # get only parent elements + has_elements_filter :parent do |*ignore| containers.length > 0 end - - filter :empty do + + # get only empty elements (without children at all) + has_elements_filter :empty do |*ignore| containers.length == 0 end - - filter :root do + + # get root element + has_elements_filter :root do |*ignore| self.is_a? Hpricot::Doc end - filter 'text' do + # get text elements + has_elements_filter 'text' do |*ignore| self.text? end - - filter 'comment' do + + # get comment elements + has_elements_filter 'comment' do |*ignore| self.comment? end - - filter :contains do |arg, ignore| + + # get elements containing given tag + has_elements_filter :contains do |arg, ignore| html.include? arg end + + predicate_procs = { + 'text()' => proc { |ele, *_| ele.inner_text.strip }, + '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? } + } + operations_procs = { + '=' => proc { |a,b| a == b }, + '!=' => proc { |a,b| a != b }, + '~=' => proc { |a,b| a.split(/\s+/).include?(b) }, + '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ }, + '^=' => proc { |a,b| a.index(b) == 0 }, + '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ }, + '*=' => proc { |a,b| idx = a.index(b) } + } - - pred_procs = - {'text()' => proc { |ele, *_| ele.inner_text.strip }, - '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? 
}} - - oper_procs = - {'=' => proc { |a,b| a == b }, - '!=' => proc { |a,b| a != b }, - '~=' => proc { |a,b| a.split(/\s+/).include?(b) }, - '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ }, - '^=' => proc { |a,b| a.index(b) == 0 }, - '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ }, - '*=' => proc { |a,b| idx = a.index(b) }} - - pred_procs.each do |pred_n, pred_f| - oper_procs.each do |oper_n, oper_f| - filter "#{pred_n}#{oper_n}" do |*a| - qual = pred_f[self, *a] - oper_f[qual, a[-2]] if qual + # construct filters like @=, @!=, text()~=, text()*= and so forth + predicate_procs.each do |predicate_name, predicate_function| + operations_procs.each do |operation_name, operation_function| + + has_elements_filter "#{predicate_name}#{operation_name}" do |*a| + qual = predicate_function[self, *a] + operation_function[qual, a[-2]] if qual end + end end - - filter 'text()' do |val,i| + + # get elements containing text + has_elements_filter 'text()' do |val, i| !self.inner_text.strip.empty? end - - filter '@' do |attr,val,i| + + # get elements that have given attribute + has_elements_filter '@' do |attr, val, i| self.elem? and has_attribute? attr end - - filter '[' do |val,i| + + has_elements_filter '[' do |val, i| self.elem? and search(val).length > 0 end