class Crass::Tokenizer

Tokenizes a CSS string.

  1. dev.w3.org/csswg/css-syntax/#tokenization

Constants

RE_COMMENT_CLOSE
RE_DIGIT
RE_ESCAPE
RE_HEX
RE_NAME
RE_NAME_START
RE_NON_PRINTABLE
RE_NUMBER_DECIMAL
RE_NUMBER_EXPONENT
RE_NUMBER_SIGN
RE_NUMBER_STR
RE_QUOTED_URL_START
RE_UNICODE_RANGE_END
RE_UNICODE_RANGE_START
RE_WHITESPACE
RE_WHITESPACE_ANCHORED

Public Class Methods

new(input, options = {}) click to toggle source

Initializes a new Tokenizer.

Options:

* **:preserve_comments** - If `true`, comments will be preserved as
  `:comment` tokens.

* **:preserve_hacks** - If `true`, certain non-standard browser hacks
  such as the IE "*" hack will be preserved even though they violate
  CSS 3 syntax rules.
# File lib/crass/tokenizer.rb, line 62
# Builds a tokenizer for the given input.
#
# The input is run through #preprocess before being handed to the Scanner.
# The options hash is stored as-is; this class reads `:preserve_comments` and
# `:preserve_hacks` from it.
def initialize(input, options = {})
  normalized = preprocess(input)

  @options = options
  @s       = Scanner.new(normalized)
end
tokenize(input, options = {}) click to toggle source

Tokenizes the given input as a CSS string and returns an array of tokens.

See {#initialize} for options.

# File lib/crass/tokenizer.rb, line 45
# Convenience entry point: builds a Tokenizer for `input` with the given
# options and returns the result of calling #tokenize on it.
def self.tokenize(input, options = {})
  tokenizer = Tokenizer.new(input, options)
  tokenizer.tokenize
end

Public Instance Methods

consume() click to toggle source

Consumes a token and returns the token that was consumed.

4.3.1. dev.w3.org/csswg/css-syntax/#consume-a-token

# File lib/crass/tokenizer.rb, line 70
# Consumes a single token from the scanner and returns it, or returns `nil`
# once the input is exhausted.
#
# Comments are handled first. When `@options[:preserve_comments]` is truthy
# the comment token is returned; otherwise the comment is dropped and
# tokenizing continues with whatever follows it. Whitespace is then collapsed
# into a single `:whitespace` token. Anything else is dispatched on the next
# character.
def consume
  # Skip discarded comments with a loop rather than by recursing: a long run
  # of adjacent comments would otherwise add one stack frame per comment and
  # could end in a SystemStackError.
  loop do
    return nil if @s.eos?

    # Re-mark on every pass so the token that is eventually created does not
    # include the text of any skipped comment.
    @s.mark

    comment_token = consume_comments
    break unless comment_token
    return comment_token if @options[:preserve_comments]
  end

  # Consume whitespace.
  return create_token(:whitespace) if @s.scan(RE_WHITESPACE)

  char = @s.consume

  case char.to_sym
  when :'"'
    consume_string

  when :'#'
    if @s.peek =~ RE_NAME || valid_escape?(@s.peek(2))
      create_token(:hash,
        :type  => start_identifier?(@s.peek(3)) ? :id : :unrestricted,
        :value => consume_name)
    else
      create_token(:delim, :value => char)
    end

  when :'$'
    if @s.peek == '='
      @s.consume
      create_token(:suffix_match)
    else
      create_token(:delim, :value => char)
    end

  when :"'"
    consume_string

  when :'('
    create_token(:'(')

  when :')'
    create_token(:')')

  when :*
    if @s.peek == '='
      @s.consume
      create_token(:substring_match)

    # Non-standard: Preserve the IE * hack.
    elsif @options[:preserve_hacks] && @s.peek =~ RE_NAME_START
      @s.reconsume
      consume_ident

    else
      create_token(:delim, :value => char)
    end

  when :+
    if start_number?
      @s.reconsume
      consume_numeric
    else
      create_token(:delim, :value => char)
    end

  when :','
    create_token(:comma)

  when :-
    next_two_chars   = @s.peek(2)
    next_three_chars = char + next_two_chars

    if start_number?(next_three_chars)
      @s.reconsume
      consume_numeric
    elsif next_two_chars == '->'
      # "-->" closes an HTML-style comment (CDC).
      @s.consume
      @s.consume
      create_token(:cdc)
    elsif start_identifier?(next_three_chars)
      @s.reconsume
      consume_ident
    else
      create_token(:delim, :value => char)
    end

  when :'.'
    if start_number?
      @s.reconsume
      consume_numeric
    else
      create_token(:delim, :value => char)
    end

  when :':'
    create_token(:colon)

  when :';'
    create_token(:semicolon)

  when :<
    # "<!--" opens an HTML-style comment (CDO).
    if @s.peek(3) == '!--'
      @s.consume
      @s.consume
      @s.consume

      create_token(:cdo)
    else
      create_token(:delim, :value => char)
    end

  when :'@'
    if start_identifier?(@s.peek(3))
      create_token(:at_keyword, :value => consume_name)
    else
      create_token(:delim, :value => char)
    end

  when :'['
    create_token(:'[')

  when :'\\'
    if valid_escape?
      @s.reconsume
      consume_ident
    else
      # Parse error.
      create_token(:delim,
        :error => true,
        :value => char)
    end

  when :']'
    create_token(:']')

  when :'^'
    if @s.peek == '='
      @s.consume
      create_token(:prefix_match)
    else
      create_token(:delim, :value => char)
    end

  when :'{'
    create_token(:'{')

  when :'}'
    create_token(:'}')

  when :U, :u
    if @s.peek(2) =~ RE_UNICODE_RANGE_START
      @s.consume
      consume_unicode_range
    else
      @s.reconsume
      consume_ident
    end

  when :|
    case @s.peek
    when '='
      @s.consume
      create_token(:dash_match)

    when '|'
      @s.consume
      create_token(:column)

    else
      create_token(:delim, :value => char)
    end

  when :~
    if @s.peek == '='
      @s.consume
      create_token(:include_match)
    else
      create_token(:delim, :value => char)
    end

  else
    case char
    when RE_DIGIT
      @s.reconsume
      consume_numeric

    when RE_NAME_START
      @s.reconsume
      consume_ident

    else
      create_token(:delim, :value => char)
    end
  end
end
consume_bad_url() click to toggle source

Consumes the remnants of a bad URL and returns the consumed text.

4.3.15. dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url

# File lib/crass/tokenizer.rb, line 275
# Consumes the remainder of a malformed URL and returns the text that was
# consumed.
#
# Reading stops at end of input or after the first unescaped `)`, which is
# consumed but not included in the returned text. Escapes are resolved via
# #consume_escaped.
def consume_bad_url
  remnants = String.new

  until @s.eos?
    # The current character together with the next one forms an escape.
    if valid_escape?
      remnants << consume_escaped
      next
    end

    # The next two characters form an escape: step over the backslash first.
    if valid_escape?(@s.peek(2))
      @s.consume
      remnants << consume_escaped
      next
    end

    char = @s.consume
    break if char == ')'

    remnants << char
  end

  remnants
end
consume_comments() click to toggle source

Consumes comments and returns them, or `nil` if no comments were consumed.

4.3.2. dev.w3.org/csswg/css-syntax/#consume-comments

# File lib/crass/tokenizer.rb, line 301
# Consumes a comment if the next two characters are `/*` and returns a
# `:comment` token whose value is the comment text; returns `nil` (consuming
# nothing) otherwise.
def consume_comments
  return nil unless @s.peek(2) == '/*'

  # Step over the opening "/*".
  2.times { @s.consume }

  body = @s.scan_until(RE_COMMENT_CLOSE)

  if body
    # Drop the last two characters of the match (the comment terminator).
    body.slice!(-2, 2)
  else
    # Parse error: the comment is never closed, so it runs to end of input.
    body = @s.consume_rest
  end

  create_token(:comment, :value => body)
end
consume_escaped() click to toggle source

Consumes an escaped code point and returns its unescaped value.

This method assumes that the `\` has already been consumed, and that the next character in the input has already been verified not to be a newline or EOF.

4.3.8. dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point

# File lib/crass/tokenizer.rb, line 326
# Consumes an escaped code point and returns its unescaped value as a string.
#
# Returns U+FFFD at end of input. A hex escape is converted to the character
# it names, or to U+FFFD when the code point is zero, a surrogate, or above
# U+10FFFF. Any other character is consumed and returned as-is.
def consume_escaped
  return "\ufffd" if @s.eos?

  hex_str = @s.scan(RE_HEX)

  # Not a hex escape: the escaped character stands for itself.
  return @s.consume unless hex_str

  # A single whitespace character directly after the hex digits belongs to
  # the escape and is swallowed.
  @s.consume if @s.peek =~ RE_WHITESPACE

  codepoint = hex_str.hex

  unrepresentable = codepoint.zero? ||
    (0xD800..0xDFFF).cover?(codepoint) ||
    codepoint > 0x10FFFF

  unrepresentable ? "\ufffd" : codepoint.chr(Encoding::UTF_8)
end
consume_ident() click to toggle source

Consumes an ident-like token and returns it.

4.3.4. dev.w3.org/csswg/css-syntax/#consume-an-ident-like-token

# File lib/crass/tokenizer.rb, line 350
# Consumes an ident-like token and returns it.
#
# The result is an `:ident` token unless the name is directly followed by
# `(`. In that case it is a `:function` token, except for an unquoted
# `url(`, which is delegated to #consume_url.
def consume_ident
  name = consume_name

  return create_token(:ident, :value => name) unless @s.peek == '('

  # Step over the "(".
  @s.consume

  return create_token(:function, :value => name) unless name.downcase == 'url'

  # Skip leading whitespace for as long as the two-character lookahead
  # matches RE_WHITESPACE_ANCHORED.
  @s.consume while @s.peek(2) =~ RE_WHITESPACE_ANCHORED

  if @s.peek(2) =~ RE_QUOTED_URL_START
    # A quoted URL is tokenized as an ordinary function.
    create_token(:function, :value => name)
  else
    consume_url
  end
end
consume_name() click to toggle source

Consumes a name and returns it.

4.3.12. dev.w3.org/csswg/css-syntax/#consume-a-name

# File lib/crass/tokenizer.rb, line 375
# Consumes a name and returns it as a string.
#
# Runs of characters matching RE_NAME are appended directly and escapes are
# resolved through #consume_escaped. The first character that is neither is
# pushed back onto the scanner and ends the name.
def consume_name
  name = String.new

  until @s.eos?
    chunk = @s.scan(RE_NAME)

    if chunk
      name << chunk
      next
    end

    char = @s.consume

    if valid_escape?
      name << consume_escaped
    elsif char == '*' && @options[:preserve_hacks]
      # Non-standard: IE * hack. Appends the result of one further scanner
      # consume.
      name << @s.consume
    else
      @s.reconsume
      break
    end
  end

  name
end
consume_number() click to toggle source

Consumes a number and returns a 3-element array containing the number's original representation, its numeric value, and its type (either `:integer` or `:number`).

4.3.13. dev.w3.org/csswg/css-syntax/#consume-a-number

# File lib/crass/tokenizer.rb, line 407
# Consumes a number and returns a three-element array: the number's original
# textual representation, its numeric value (via #convert_string_to_number)
# and its type, which is `:integer` unless a decimal or exponent part was
# consumed, in which case it is `:number`.
def consume_number
  repr = String.new
  type = :integer

  repr << @s.consume if @s.peek =~ RE_NUMBER_SIGN
  repr << (@s.scan(RE_DIGIT) || '')

  # A decimal part and an exponent part are each optional; finding either
  # one makes this a non-integer number.
  [RE_NUMBER_DECIMAL, RE_NUMBER_EXPONENT].each do |pattern|
    part = @s.scan(pattern)
    next unless part

    repr << part
    type = :number
  end

  [repr, convert_string_to_number(repr), type]
end
consume_numeric() click to toggle source

Consumes a numeric token and returns it.

4.3.3. dev.w3.org/csswg/css-syntax/#consume-a-numeric-token

# File lib/crass/tokenizer.rb, line 430
# Consumes a numeric token and returns it: a `:dimension` token when the
# number is followed by something that would start an identifier, a
# `:percentage` token when followed by `%`, and a `:number` token otherwise.
def consume_numeric
  repr, value, type = consume_number

  # Coerce the converted value to match the reported type.
  value = type == :integer ? value.to_i : value.to_f

  if start_identifier?(@s.peek(3))
    return create_token(:dimension,
      :repr  => repr,
      :type  => type,
      :unit  => consume_name,
      :value => value)
  end

  if @s.peek == '%'
    @s.consume

    return create_token(:percentage,
      :repr  => repr,
      :type  => type,
      :value => value)
  end

  create_token(:number,
    :repr  => repr,
    :type  => type,
    :value => value)
end
consume_string(ending = nil) click to toggle source

Consumes a string token that ends at the given character, and returns the token.

4.3.5. dev.w3.org/csswg/css-syntax/#consume-a-string-token

# File lib/crass/tokenizer.rb, line 469
# Consumes a string token that ends at the given character and returns the
# token. When `ending` is `nil`, the scanner's current character is used as
# the terminator.
#
# An unescaped newline yields a `:bad_string` token flagged as an error; the
# newline is pushed back onto the scanner. Reaching end of input simply ends
# the string.
def consume_string(ending = nil)
  ending = @s.current if ending.nil?
  value  = String.new

  until @s.eos?
    case char = @s.consume
    when ending
      break

    when "\n"
      # Parse error.
      @s.reconsume
      return create_token(:bad_string,
        :error => true,
        :value => value)

    when '\\'
      upcoming = @s.peek

      # End of the input, so do nothing.
      next if upcoming == ''

      if upcoming == "\n"
        # An escaped newline is consumed and contributes nothing.
        @s.consume
      else
        value << consume_escaped
      end

    else
      value << char
    end
  end

  create_token(:string, :value => value)
end
consume_unicode_range() click to toggle source

Consumes a Unicode range token and returns it. Assumes the initial “u+” or “U+” has already been consumed.

4.3.7. dev.w3.org/csswg/css-syntax/#consume-a-unicode-range-token

# File lib/crass/tokenizer.rb, line 510
# Consumes a Unicode range token and returns it. Assumes the initial "u+" or
# "U+" has already been consumed.
#
# The token carries `:start` and `:end` code points. A value containing `?`
# wildcards expands to the range obtained by replacing them with `0` and `F`
# respectively. Otherwise `:end` is read from an explicit range end if the
# lookahead matches RE_UNICODE_RANGE_END, and equals `:start` if not.
def consume_unicode_range
  value = @s.scan(RE_HEX) || String.new

  # Pad with "?" wildcards up to a total of six characters.
  value << @s.consume while value.length < 6 && @s.peek == '?'

  if value.include?('?')
    return create_token(:unicode_range,
      :start => value.gsub('?', '0').hex,
      :end   => value.gsub('?', 'F').hex)
  end

  first = value.hex

  last =
    if @s.peek(2) =~ RE_UNICODE_RANGE_END
      # Step over the separator, then read the end of the range.
      @s.consume
      (@s.scan(RE_HEX) || '').hex
    else
      first
    end

  create_token(:unicode_range, :start => first, :end => last)
end
consume_url() click to toggle source

Consumes a URL token and returns it. Assumes the original “url(” has already been consumed.

4.3.6. dev.w3.org/csswg/css-syntax/#consume-a-url-token

# File lib/crass/tokenizer.rb, line 542
# Consumes a URL token and returns it. Assumes the original "url(" has
# already been consumed.
#
# Returns a `:url` token on success. Returns a `:bad_url` token, whose value
# is the text read so far plus the remnants gathered by #consume_bad_url,
# when the URL is malformed.
def consume_url
  url = String.new

  # Builds the error-flagged token used for outright parse errors.
  parse_error = lambda do
    create_token(:bad_url,
      :error => true,
      :value => url + consume_bad_url)
  end

  @s.scan(RE_WHITESPACE)

  until @s.eos?
    char = @s.consume

    case char
    when ')'
      break

    when RE_WHITESPACE
      @s.scan(RE_WHITESPACE)

      # Whitespace inside a URL may only be followed by ")" or end of input.
      unless @s.eos? || @s.peek == ')'
        return create_token(:bad_url, :value => url + consume_bad_url)
      end

      @s.consume
      break

    when '"', "'", '(', RE_NON_PRINTABLE
      # Parse error.
      return parse_error.call

    when '\\'
      # Parse error unless the backslash starts a valid escape.
      return parse_error.call unless valid_escape?

      url << consume_escaped

    else
      url << char
    end
  end

  create_token(:url, :value => url)
end
convert_string_to_number(str) click to toggle source

Converts a valid CSS number string into a number and returns the number.

4.3.14. dev.w3.org/csswg/css-syntax/#convert-a-string-to-a-number

# File lib/crass/tokenizer.rb, line 590
# Converts a valid CSS number string into a number and returns the number.
#
# `str` must match RE_NUMBER_STR. The result is computed with exact integer
# and rational arithmetic and is clamped to the range
# `-Float::MAX..Float::MAX`.
#
# Exponents come straight from the input, so the power of ten is never
# computed for an exponent that is certain to overflow or underflow a Float:
# such input yields `±Float::MAX` or a signed `0.0` directly. Without this,
# a token such as `1e999999999` would force an enormous power computation.
def convert_string_to_number(str)
  matches = RE_NUMBER_STR.match(str)

  s = matches[:sign] == '-' ? -1 : 1
  i = matches[:integer].to_i
  f = matches[:fractional].to_i
  d = matches[:fractional] ? matches[:fractional].length : 0
  t = matches[:exponent_sign] == '-' ? -1 : 1
  e = matches[:exponent].to_i

  mantissa = i + f * 10**-d

  # Below 10**-(underflow_exp + 1) a value rounds to zero as a Float; the
  # DIG + 2 margin covers the denormal range beneath Float::MIN_10_EXP.
  underflow_exp = -Float::MIN_10_EXP + Float::DIG + 2

  if mantissa.zero?
    # Zero times any power of ten is zero; cap the exponent so the power
    # below stays cheap without changing the result.
    e = [e, Float::MAX_10_EXP].min
  elsif t == 1 && e - d > Float::MAX_10_EXP
    # A non-zero mantissa is at least 10**-d, so the magnitude is at least
    # 10**(e - d), which is already beyond Float::MAX.
    return s * Float::MAX
  elsif t == -1 && e - matches[:integer].to_s.length > underflow_exp
    # The mantissa is below 10**(integer digit count), so the magnitude is
    # too small to be represented as a Float.
    return s * 0.0
  end

  # I know this formula looks nutty, but it's exactly what's defined in the
  # spec, and it works.
  value = s * mantissa * 10**(t * e)

  # Maximum and minimum values aren't defined in the spec, but are enforced
  # here for sanity.
  if value > Float::MAX
    value = Float::MAX
  elsif value < -Float::MAX
    value = -Float::MAX
  end

  value
end
create_token(type, properties = {}) click to toggle source

Creates and returns a new token with the given properties.

# File lib/crass/tokenizer.rb, line 616
# Creates and returns a new token hash.
#
# Every token carries `:node` (the given type), `:pos` (the scanner's
# marker) and `:raw` (the scanner's marked text). The given properties are
# merged on top and win on key collisions.
def create_token(type, properties = {})
  token = { :node => type, :pos => @s.marker, :raw => @s.marked }
  token.merge!(properties)
end
preprocess(input) click to toggle source

Preprocesses input to prepare it for the tokenizer.

3.3. dev.w3.org/csswg/css-syntax/#input-preprocessing

# File lib/crass/tokenizer.rb, line 627
# Prepares input for the tokenizer and returns the resulting string.
#
# The input is converted to a string and encoded as UTF-8, with invalid or
# undefined sequences replaced. CRLF pairs, lone carriage returns and form
# feeds are then normalised to a single newline, and NUL characters are
# replaced with U+FFFD.
def preprocess(input)
  text = input.to_s.encode('UTF-8', :invalid => :replace, :undef => :replace)

  text.gsub!(/(?:\r\n|[\r\f])/, "\n")
  text.gsub!("\u0000", "\ufffd")

  text
end
start_identifier?(text = nil) click to toggle source

Returns `true` if the given three-character text would start an identifier. If text is `nil`, the current and next two characters in the input stream will be checked, but will not be consumed.

4.3.10. dev.w3.org/csswg/css-syntax/#would-start-an-identifier

# File lib/crass/tokenizer.rb, line 642
# Returns `true` if the given three-character text would start an
# identifier. When `text` is `nil`, the scanner's current character plus the
# next two are checked; nothing is consumed.
def start_identifier?(text = nil)
  text = @s.current + @s.peek(2) if text.nil?

  case text[0]
  when '-'
    # After a hyphen the identifier may continue with another hyphen, a
    # name-start character, or a valid escape.
    second = text[1]
    return true if second == '-' || second =~ RE_NAME_START

    !!valid_escape?(text[1, 2])

  when RE_NAME_START
    true

  when '\\'
    valid_escape?(text[0, 2])

  else
    false
  end
end
start_number?(text = nil) click to toggle source

Returns `true` if the given three-character text would start a number. If text is `nil`, the current and next two characters in the input stream will be checked, but will not be consumed.

4.3.11. dev.w3.org/csswg/css-syntax/#starts-with-a-number

# File lib/crass/tokenizer.rb, line 666
# Returns `true` if the given three-character text would start a number.
# When `text` is `nil`, the scanner's current character plus the next two
# are checked; nothing is consumed.
def start_number?(text = nil)
  text = @s.current + @s.peek(2) if text.nil?

  first, second, third = text[0], text[1], text[2]

  case first
  when '+', '-'
    # A sign must be followed by a digit, or by "." and then a digit.
    return true if second =~ RE_DIGIT

    !!(second == '.' && third =~ RE_DIGIT)

  when '.'
    !!(second =~ RE_DIGIT)

  when RE_DIGIT
    true

  else
    false
  end
end
tokenize() click to toggle source

Tokenizes the input stream and returns an array of tokens.

# File lib/crass/tokenizer.rb, line 685
# Resets the scanner, tokenizes the whole input stream and returns the
# tokens as an array, in the order they were consumed.
def tokenize
  @s.reset

  tokens = []
  token  = consume

  # #consume signals the end of the stream with a falsy value.
  while token
    tokens << token
    token = consume
  end

  tokens
end
valid_escape?(text = nil) click to toggle source

Returns `true` if the given two-character text is the beginning of a valid escape sequence. If text is `nil`, the current and next character in the input stream will be checked, but will not be consumed.

4.3.9. dev.w3.org/csswg/css-syntax/#starts-with-a-valid-escape

# File lib/crass/tokenizer.rb, line 702
# Returns `true` if the given two-character text is the beginning of a valid
# escape sequence: a backslash followed by anything other than a newline.
# When `text` is `nil`, the scanner's current and next characters are
# checked; nothing is consumed.
def valid_escape?(text = nil)
  text = @s.current + @s.peek if text.nil?

  first, second = text[0], text[1]
  return false unless first == '\\'

  second != "\n"
end