Class RubyLex
In: rdoc/parsers/parse_rb.rb
Parent: Object

Lexical analyzer for Ruby source

Methods

Included Modules

RubyToken

Classes and Modules

Class RubyLex::BufferedReader

Constants

ENINDENT_CLAUSE = [ "case", "class", "def", "do", "for", "if", "module", "unless", "until", "while", "begin" ]
DEINDENT_CLAUSE = ["end"]
PERCENT_LTYPE = { "q" => "\'", "Q" => "\"", "x" => "\`", "r" => "\/", "w" => "]" }
PERCENT_PAREN = { "{" => "}", "[" => "]", "<" => ">", "(" => ")" }
Ltype2Token = { "\'" => TkSTRING, "\"" => TkSTRING, "\`" => TkXSTRING, "\/" => TkREGEXP, "]" => TkDSTRING }
DLtype2Token = { "\"" => TkDSTRING, "\`" => TkDXSTRING, "\/" => TkDREGEXP, }

Attributes

continue  [R] 
exception_on_syntax_error  [RW] 
indent  [R] 
lex_state  [R] 
read_auto_clean_up  [RW] 
skip_space  [RW] 

Public Class methods

[Source]

# File rdoc/parsers/parse_rb.rb, line 443
  # Reports whether lexer debug tracing is enabled. Hard-wired to +false+;
  # the rule blocks and identify_identifier consult it before printing
  # diagnostic output.
  def RubyLex.debug?
    false
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 447
  # Creates a lexer over +content+.
  #
  # Registers the token rules (lex_init), wraps +content+ in a
  # BufferedReader and resets all lexer state: position counters, indent
  # depth, string/quote tracking, lex state and the behaviour flags.
  def initialize(content)
    # build the @OP rule table before anything tries to match tokens
    lex_init

    @reader = BufferedReader.new(content)

    @exp_line_no = @line_no = 1
    @base_char_no = 0
    @indent = 0

    # no string/heredoc/comment literal is being read yet
    @ltype = nil
    @quoted = nil
    @lex_state = EXPR_BEG
    @space_seen = false
    
    @continue = false
    @line = ""

    # behaviour flags (skip_space, read_auto_clean_up and
    # exception_on_syntax_error are exposed as read/write attributes)
    @skip_space = false
    @read_auto_clean_up = false
    @exception_on_syntax_error = true
  end

Public Instance methods

[Source]

# File rdoc/parsers/parse_rb.rb, line 480
  # Current column, as reported by the underlying reader.
  def char_no
    @reader.column
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 484
  # Delegates to the reader's +get_read+. #lex treats the result as the
  # source text of the line just lexed.
  def get_read
    @reader.get_read
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 488
  # Reads the next character from the reader. Callers in this class treat a
  # falsy result as end of input.
  def getc
    @reader.getc
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 492
  # Delegates to the reader's +getc_already_read+.
  def getc_of_rests
    @reader.getc_already_read
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 496
  # Reads one line from the input, character by character.
  #
  # Carriage returns are dropped; the terminating "\n" (when present) is
  # kept. Returns +nil+ when no character is available at all.
  def gets
    ch = getc
    return unless ch

    line = ""
    loop do
      line.concat ch unless ch == "\r"
      break if ch == "\n"
      ch = getc
      break unless ch
    end
    line
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 1269
  # Reads a "#" comment up to (but not including) the end of the line and
  # returns it as a TkCOMMENT token whose text starts with "#".
  #
  # A backslash immediately followed by a newline is folded into a single
  # space and reading continues on the next line; any other backslash is
  # kept verbatim. @ltype is "#" while reading and is reset to nil once
  # the terminating newline is seen (the newline itself is pushed back).
  def identify_comment
    @ltype = "#"
    text = "#"
    loop do
      ch = getc
      break unless ch

      case ch
      when "\\"
        ch = getc
        if ch == "\n"
          ch = " "
        else
          text << "\\"
        end
      when "\n"
        @ltype = nil
        ungetc
        break
      end
      text << ch
    end
    Token(TkCOMMENT).set_text(text)
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 964
  # Lexes a token that begins with "$".
  #
  # The token text starts as "$"; the next character decides the kind:
  # * punctuation globals ($~, $_, $!, ...)  -> TkGVAR
  # * "$-" plus one more character           -> TkGVAR
  # * $&, $`, $', $+                         -> TkBACK_REF
  # * $1, $23, ...                           -> TkNTH_REF
  # * any other word character               -> both characters are pushed
  #   back and identify_identifier lexes the whole name
  # * anything else                          -> a bare "$" token
  #
  # @lex_state is set to EXPR_END up front.
  def identify_gvar
    @lex_state = EXPR_END
    str = "$"

    tk = case ch = getc
         when /[~_*$?!@\/\\;,=:<>".]/   #"
           str << ch
           Token(TkGVAR, str)
           
         when "-"
           # NOTE(review): getc may return nil at end of input, which would
           # make this append raise - confirm callers never hit that.
           str << "-" << getc
           Token(TkGVAR, str)
           
         when "&", "`", "'", "+"
           str << ch
           Token(TkBACK_REF, str)
           
         when /[1-9]/
           str << ch
           # swallow the remaining digits, then give back the first non-digit
           while (ch = getc) =~ /[0-9]/
             str << ch
           end
           ungetc
           Token(TkNTH_REF)
         when /\w/
           # un-read both the word character and the "$" so that
           # identify_identifier sees the complete name
           ungetc
           ungetc
           return identify_identifier
         else 
           ungetc
           Token("$")     
         end
    tk.set_text(str)
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 1074
  # Lexes a here document; called after "<<" has been consumed.
  #
  # Reads the optional "-" flag and the (possibly quoted) terminator, sets
  # aside the rest of the introducing line, then collects body lines until
  # the terminator line. The set-aside text is handed back to the reader
  # with divert_read_from so it is lexed after the here document.
  #
  # With "<<-" the terminator may be indented, so the comparison is made on
  # a stripped copy of each line; the body lines themselves keep their
  # original leading whitespace.
  #
  # Returns a token of the class Ltype2Token maps the quote type to, with
  # the body as its value and the dumped body as its text. @lex_state is
  # left at EXPR_END and @ltype is restored to its previous value.
  def identify_here_document
    ch = getc
    if ch == "-"
      ch = getc
      indent = true
    end
    if /['"`]/ =~ ch            # '
      lt = ch
      quoted = ""
      while (c = getc) && c != lt
        quoted.concat c
      end
    else
      lt = '"'
      quoted = ch.dup
      while (c = getc) && c =~ /\w/
        quoted.concat c
      end
      ungetc
    end

    ltback, @ltype = @ltype, lt
    reserve = ""

    # Set aside whatever follows the terminator on the introducing line.
    while ch = getc
      reserve << ch
      if ch == "\\"    #"
        ch = getc
        # nothing to append when the backslash is the last character of input
        reserve << ch if ch
      elsif ch == "\n"
        break
      end
    end

    str = ""
    while (l = gets)
      l.chomp!
      # Only the terminator test ignores indentation; the body is kept as is.
      break if (indent ? l.strip : l) == quoted
      str << l << "\n"
    end

    @reader.divert_read_from(reserve)

    @ltype = ltback
    @lex_state = EXPR_END
    Token(Ltype2Token[lt], str).set_text(str.dump)
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 999
  # Lexes an identifier-like token: a global ($x), instance or class
  # variable (@x / @@x), reserved word, constant, method name ending in
  # "!" or "?", or a plain identifier.
  #
  # Returns the matching token (TkGVAR, TkIVAR, a reserved-word token,
  # TkCONSTANT, TkFID or TkIDENTIFIER) with its text set to the name, and
  # updates @lex_state (and @indent for block-opening/closing keywords).
  def identify_identifier
    token = ""
    # optional sigil(s): "$", "@" or "@@"
    token.concat getc if peek(0) =~ /[$@]/
    token.concat getc if peek(0) == "@"

    while (ch = getc) =~ /\w|_/
      print ":", ch, ":" if RubyLex.debug?
      token.concat ch
    end
    # give back the character that ended the name
    ungetc
    
    # a trailing "!" or "?" belongs to the name
    if ch == "!" or ch == "?"
      token.concat getc
    end
    # fix token

    # $stderr.puts "identifier - #{token}, state = #@lex_state"

    case token
    when /^\$/
      return Token(TkGVAR, token).set_text(token)
    when /^\@/
      @lex_state = EXPR_END
      return Token(TkIVAR, token).set_text(token)
    end
    
    # After a "." (EXPR_DOT) every name is a method call, so reserved words
    # are only looked up in the other states.
    if @lex_state != EXPR_DOT
      print token, "\n" if RubyLex.debug?

      # trans[0] is used as the next lex state; trans[1], when present, is
      # looked up in TkSymbol2Token for the modifier form of the keyword
      token_c, *trans = TkReading2Token[token]
      if token_c
        # reserved word?

        if (@lex_state != EXPR_BEG &&
            @lex_state != EXPR_FNAME &&
            trans[1])
          # modifiers
          token_c = TkSymbol2Token[trans[1]]
          @lex_state = trans[0]
        else
          if @lex_state != EXPR_FNAME
            # track nesting depth for block-opening / closing keywords
            if ENINDENT_CLAUSE.include?(token)
              @indent += 1
            elsif DEINDENT_CLAUSE.include?(token)
              @indent -= 1
            end
            @lex_state = trans[0]
          else
            @lex_state = EXPR_END
          end
        end
        return Token(token_c, token).set_text(token)
      end
    end

    if @lex_state == EXPR_FNAME
      @lex_state = EXPR_END
      # in a method-name position a trailing "=" is part of the name
      if peek(0) == '='
        token.concat getc
      end
    elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
      @lex_state = EXPR_ARG
    else
      @lex_state = EXPR_END
    end

    # classify by spelling: leading capital, trailing !/?, or plain
    if token[0, 1] =~ /[A-Z]/
      return Token(TkCONSTANT, token).set_text(token)
    elsif token[token.size - 1, 1] =~ /[!?]/
      return Token(TkFID, token).set_text(token)
    else
      return Token(TkIDENTIFIER, token).set_text(token)
    end
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 1142
  # Lexes a numeric literal and returns a TkINTEGER or TkFLOAT token whose
  # text is the literal as written.
  #
  # +start+ is what the caller has already consumed: a sign ("+" / "-"), or
  # "" when nothing has been consumed; in both cases the first character of
  # the number is read here. @lex_state is set to EXPR_END.
  #
  # Handled forms:
  # * 0x / 0b / 0o / 0d radix prefixes (either case) and bare-zero octal
  # * decimal integers with "_" separators
  # * floats with a fraction and/or exponent, including ones that start
  #   with "0" (0.5) or with the decimal point itself (.5)
  #
  # A "." that is not followed by a digit is pushed back and leaves the
  # token an integer, so "3.times" lexes as the integer 3.
  def identify_number(start)
    str = start.dup

    if start == "+" || start == "-" || start == ""
      start = getc
      str << start
    end

    @lex_state = EXPR_END

    # A leading zero introduces a radix-prefixed or octal integer, unless a
    # fraction or exponent follows, in which case it is a decimal number.
    if start == "0" && peek(0) !~ /[.eE]/
      digits = case peek(0)
               when /[xX]/ then /[0-9a-fA-F_]/
               when /[bB]/ then /[01_]/
               when /[oO]/ then /[0-7_]/
               when /[dD]/ then /[0-9_]/
               end
      if digits
        str << getc           # keep the radix letter in the token text
      else
        digits = /[0-7_]/     # bare leading zero: octal
      end

      while ch = getc
        if ch !~ digits
          ungetc
          break
        end
        str << ch
      end
      return Token(TkINTEGER).set_text(str)
    end

    type = TkINTEGER
    allow_point = true
    allow_e = true
    # The decimal point was already consumed as the first character (".5").
    if start == "."
      type = TkFLOAT
      allow_point = false
    end

    while ch = getc
      case ch
      when /[0-9_]/
        str << ch

      when allow_point && "."
        # "1.foo" / "1..2": the dot is not part of the number
        if peek(0) !~ /[0-9]/
          ungetc
          break
        end
        type = TkFLOAT
        str << ch
        allow_point = false

      when allow_e && "e", allow_e && "E"
        str << ch
        type = TkFLOAT
        if peek(0) =~ /[+-]/
          str << getc
        end
        allow_e = false
        allow_point = false
      else
        ungetc
        break
      end
    end
    Token(type).set_text(str)
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 1123
  # Lexes a "%" literal; +initial_char+ is the text consumed so far ("%").
  #
  # The character after "%" is either a type letter listed in
  # PERCENT_LTYPE (which is then appended to +initial_char+ and followed by
  # the delimiter) or a non-word delimiter, which means a double-quoted
  # string. Any other character is reported through RubyLex.fail as a
  # SyntaxError. The closing delimiter is the PERCENT_PAREN partner of the
  # opener, or the opener itself. The rest is delegated to identify_string.
  def identify_quotation(initial_char)
    delimiter = getc
    ltype = PERCENT_LTYPE[delimiter]
    if ltype
      initial_char += delimiter
      delimiter = getc
    elsif delimiter =~ /\W/
      ltype = "\""
    else
      RubyLex.fail SyntaxError, "unknown type of %string ('#{delimiter}')"
    end

    @quoted = PERCENT_PAREN[delimiter] || delimiter
    identify_string(ltype, @quoted, delimiter, initial_char)
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 1204
  # Lexes a string-like literal (string, backtick command, regexp or word
  # list) whose opening delimiter has already been consumed.
  #
  # ltype::        literal type key used with Ltype2Token / DLtype2Token
  # quoted::       closing delimiter (defaults to +ltype+)
  # opener::       opening delimiter when it differs from the closer, so
  #                nested pairs can be counted; nil otherwise
  # initial_char:: text already consumed before the opener (e.g. "%q")
  #
  # Backslash escapes are copied through read_escape. "#{...}" is copied
  # through skip_inner_expression (except in single-quoted and "]"-typed
  # literals) and makes the result the "dynamic" token class from
  # DLtype2Token. For regexps every trailing option letter is consumed,
  # so /re/mi yields one token.
  #
  # Returns the token with its text set to the literal as written. @ltype
  # and @quoted are always cleared and @lex_state set to EXPR_END, even
  # when an exception propagates.
  def identify_string(ltype, quoted = ltype, opener=nil, initial_char = nil)
    @ltype = ltype
    @quoted = quoted
    subtype = nil

    str = ""
    str << initial_char if initial_char
    str << (opener||quoted)

    nest = 0
    begin
      while ch = getc 
        str << ch
        if @quoted == ch 
          if nest == 0
            break
          else
            nest -= 1
          end
        elsif opener == ch
          nest += 1
        elsif @ltype != "'" && @ltype != "]" and ch == "#"
          ch = getc
          if ch == "{"
            subtype = true
            str << ch << skip_inner_expression
          else
            # a lone "#": re-read the following character normally
            ungetc(ch)
          end
        elsif ch == '\\' #'
          str << read_escape
        end
      end
      if @ltype == "/"
        # Regexp options may be combined (/x/mi); take all of them.
        while peek(0) =~ /[imxounes]/
          str << getc
        end
      end
      if subtype
        Token(DLtype2Token[ltype], str)
      else
        Token(Ltype2Token[ltype], str)
      end.set_text(str)
    ensure
      @ltype = nil
      @quoted = nil
      @lex_state = EXPR_END
    end
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 519
  # Lexes one logical line and returns its source text.
  #
  # Tokens are consumed until a newline or end-of-script token arrives
  # while no continuation is pending (@continue is false), or until #token
  # returns nil. The text read so far is then fetched with get_read.
  # Returns nil when that text is empty and the input is exhausted.
  def lex
    tk = nil
    loop do
      tk = token
      break if tk.nil?

      line_ended = tk.kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)
      break if line_ended && !@continue
    end

    line = get_read
    exhausted = tk.nil? || tk.kind_of?(TkEND_OF_SCRIPT)
    return nil if line == "" && exhausted

    line
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 589
  # Builds the first half of the token rule table in @OP (an SLex), then
  # calls lex_int2 for the remainder.
  #
  # Each rule maps one or more literal character sequences to a block that
  # produces the token and updates lexer state (@lex_state, @space_seen,
  # @continue, @ltype). A rule may carry a precondition proc that must be
  # truthy for the rule to apply.
  #
  # Boolean state is written with the +true+ / +false+ literals; the old
  # TRUE / FALSE constants are not defined on current Ruby versions.
  def lex_init()
    @OP = SLex.new

    # end-of-script markers: NUL, ^D, ^Z
    @OP.def_rules("\0", "\004", "\032") do |chars, io|
      Token(TkEND_OF_SCRIPT).set_text(chars)
    end

    # a run of horizontal whitespace becomes a single TkSPACE
    @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |chars, io|
      @space_seen = true
      while (ch = getc) =~ /[ \t\f\r\13]/
        chars << ch
      end
      ungetc
      Token(TkSPACE).set_text(chars)
    end

    @OP.def_rule("#") do
      |op, io|
      identify_comment
    end

    # =begin ... =end block comment; only at the start of a line
    @OP.def_rule("=begin", proc{@prev_char_no == 0 && peek(0) =~ /\s/}) do
      |op, io|
      str = op
      @ltype = "="


      # copy whole lines until one starts with "=end"
      begin
        line = ""
        begin
          ch = getc
          line << ch
        end until ch == "\n"
        str << line
      end until line =~ /^=end/

      # leave the newline that ends the "=end" line for the "\n" rule
      ungetc

      @ltype = nil

      # "=begin rdoc" blocks are documentation: strip the marker lines and
      # return the body as an ordinary comment
      if str =~ /\A=begin\s+rdoc/i
        str.sub!(/\A=begin.*\n/, '')
        str.sub!(/^=end.*/m, '')
        Token(TkCOMMENT).set_text(str)
      else
        Token(TkRD_COMMENT)#.set_text(str)
      end
    end

    # newline: the statement continues when an operand is still expected
    @OP.def_rule("\n") do
      print "\\n\n" if RubyLex.debug?
      case @lex_state
      when EXPR_BEG, EXPR_FNAME, EXPR_DOT
        @continue = true
      else
        @continue = false
        @lex_state = EXPR_BEG
      end
      Token(TkNL).set_text("\n")
    end

    @OP.def_rules("*", "**",    
                  "!", "!=", "!~",
                  "=", "==", "===", 
                  "=~", "<=>",        
                  "<", "<=",
                  ">", ">=", ">>") do
      |op, io|
      @lex_state = EXPR_BEG
      Token(op).set_text(op)
    end

    # "<<" is either a here document introducer or the shift/append operator
    @OP.def_rules("<<") do
      |op, io|
      tk = nil
      if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
          (@lex_state != EXPR_ARG || @space_seen)
        c = peek(0)
        if /[-\w_\"\'\`]/ =~ c
          tk = identify_here_document
        end
      end
      if !tk
        @lex_state = EXPR_BEG
        tk = Token(op).set_text(op)
      end
      tk
    end

    @OP.def_rules("'", '"') do
      |op, io|
      identify_string(op)
    end

    # a backtick is a method name after "def", otherwise a command string
    @OP.def_rules("`") do
      |op, io|
      if @lex_state == EXPR_FNAME
        Token(op).set_text(op)
      else
        identify_string(op)
      end
    end

    # "?" is the ternary operator or a character literal (?a, ?\n)
    @OP.def_rules('?') do
      |op, io|
      if @lex_state == EXPR_END
        @lex_state = EXPR_BEG
        Token(TkQUESTION).set_text(op)
      else
        ch = getc
        if @lex_state == EXPR_ARG && ch !~ /\s/
          ungetc
          @lex_state = EXPR_BEG;
          Token(TkQUESTION).set_text(op)
        else
          str = op
          str << ch
          if (ch == '\\') #'
            str << read_escape
          end
          @lex_state = EXPR_END
          Token(TkINTEGER).set_text(str)
        end
      end
    end

    @OP.def_rules("&", "&&", "|", "||") do
      |op, io|
      @lex_state = EXPR_BEG
      Token(op).set_text(op)
    end
    
    # operator-assignment: the token value is the operator without "="
    @OP.def_rules("+=", "-=", "*=", "**=", 
                  "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
      |op, io|
      @lex_state = EXPR_BEG
      op =~ /^(.*)=$/
      Token(TkOPASGN, $1).set_text(op)
    end

    # unary operator method names, only valid where a method name is expected
    @OP.def_rule("+@", proc{@lex_state == EXPR_FNAME}) do |op, io|
      Token(TkUPLUS).set_text(op)
    end

    @OP.def_rule("-@", proc{@lex_state == EXPR_FNAME}) do |op, io|
      Token(TkUMINUS).set_text(op)
    end

    # "+" / "-": the sign of a numeric literal or a binary operator
    @OP.def_rules("+", "-") do
      |op, io|
      catch(:RET) do
        if @lex_state == EXPR_ARG
          if @space_seen and peek(0) =~ /[0-9]/
            throw :RET, identify_number(op)
          else
            @lex_state = EXPR_BEG
          end
        elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
          throw :RET, identify_number(op)
        else
          @lex_state = EXPR_BEG
        end
        Token(op).set_text(op)
      end
    end

    # "." starts a number when a digit follows, else it is a method call dot
    @OP.def_rule(".") do
      @lex_state = EXPR_BEG
      if peek(0) =~ /[0-9]/
        ungetc
        identify_number("")
      else
        # for obj.if
        @lex_state = EXPR_DOT
        Token(TkDOT).set_text(".")
      end
    end

    @OP.def_rules("..", "...") do
      |op, io|
      @lex_state = EXPR_BEG
      Token(op).set_text(op)
    end

    lex_int2
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 775
  def lex_int2
    @OP.def_rules("]", "}", ")") do
      |op, io|
      @lex_state = EXPR_END
      @indent -= 1
      Token(op).set_text(op)
    end

    @OP.def_rule(":") do
      if @lex_state == EXPR_END || peek(0) =~ /\s/
        @lex_state = EXPR_BEG
        tk = Token(TkCOLON)
      else
        @lex_state = EXPR_FNAME;
        tk = Token(TkSYMBEG)
      end
      tk.set_text(":")
    end

    @OP.def_rule("::") do
#      p @lex_state.id2name, @space_seen
      if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
        @lex_state = EXPR_BEG
        tk = Token(TkCOLON3)
      else
        @lex_state = EXPR_DOT
        tk = Token(TkCOLON2)
      end
      tk.set_text("::")
    end

    @OP.def_rule("/") do
      |op, io|
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        identify_string(op)
      elsif peek(0) == '='
        getc
        @lex_state = EXPR_BEG
        Token(TkOPASGN, :/).set_text("/=") #")
      elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
        identify_string(op)
      else 
        @lex_state = EXPR_BEG
        Token("/").set_text(op)
      end
    end

    @OP.def_rules("^") do
      @lex_state = EXPR_BEG
      Token("^").set_text("^")
    end

    #       @OP.def_rules("^=") do
    #   @lex_state = EXPR_BEG
    #   Token(TkOPASGN, :^)
    #       end
    
    @OP.def_rules(",", ";") do
      |op, io|
      @lex_state = EXPR_BEG
      Token(op).set_text(op)
    end

    @OP.def_rule("~") do
      @lex_state = EXPR_BEG
      Token("~").set_text("~")
    end

    @OP.def_rule("~@", proc{@lex_state = EXPR_FNAME}) do
      @lex_state = EXPR_BEG
      Token("~").set_text("~@")
    end
    
    @OP.def_rule("(") do
      @indent += 1
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        @lex_state = EXPR_BEG
        tk = Token(TkfLPAREN)
      else
        @lex_state = EXPR_BEG
        tk = Token(TkLPAREN)
      end
      tk.set_text("(")
    end

    @OP.def_rule("[]", proc{@lex_state == EXPR_FNAME}) do
      Token("[]").set_text("[]")
    end

    @OP.def_rule("[]=", proc{@lex_state == EXPR_FNAME}) do
      Token("[]=").set_text("[]=")
    end

    @OP.def_rule("[") do
      @indent += 1
      if @lex_state == EXPR_FNAME
        t = Token(TkfLBRACK)
      else
        if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
          t = Token(TkLBRACK)
        elsif @lex_state == EXPR_ARG && @space_seen
          t = Token(TkLBRACK)
        else
          t = Token(TkfLBRACK)
        end
        @lex_state = EXPR_BEG
      end
      t.set_text("[")
    end

    @OP.def_rule("{") do
      @indent += 1
      if @lex_state != EXPR_END && @lex_state != EXPR_ARG
        t = Token(TkLBRACE)
      else
        t = Token(TkfLBRACE)
      end
      @lex_state = EXPR_BEG
      t.set_text("{")
    end

    @OP.def_rule('\\') do   #'
      if getc == "\n" 
        @space_seen = true
        @continue = true
        Token(TkSPACE).set_text("\\\n")
      else 
        ungetc
        Token("\\").set_text("\\")  #"
      end 
    end 

    @OP.def_rule('%') do
      |op, io|
      if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
        identify_quotation('%')
      elsif peek(0) == '='
        getc
        Token(TkOPASGN, "%").set_text("%=")
      elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
        identify_quotation('%')
      else
        @lex_state = EXPR_BEG
        Token("%").set_text("%")
      end
    end

    @OP.def_rule('$') do  #'
      identify_gvar
    end

    @OP.def_rule('@') do
      if peek(0) =~ /[@\w_]/
        ungetc
        identify_identifier
      else
        Token("@").set_text("@")
      end
    end

    #       @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do 
    #   |op, io|
    #   @indent += 1
    #   @lex_state = EXPR_FNAME
    # # @lex_state = EXPR_END
    # # until @rests[0] == "\n" or @rests[0] == ";"
    # #   rests.shift
    # # end
    #       end

    @OP.def_rule("__END__", proc{@prev_char_no == 0 && peek(0) =~ /[\r\n]/}) do
      throw :eof
    end

    @OP.def_rule("") do
      |op, io|
      printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
      if peek(0) =~ /[0-9]/
        t = identify_number("")
      elsif peek(0) =~ /[\w_]/
        t = identify_identifier
      end
      printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
      t
    end
    
    p @OP if RubyLex.debug?
  end

io functions

[Source]

# File rdoc/parsers/parse_rb.rb, line 476
  # Current line number, as reported by the underlying reader.
  def line_no
    @reader.line_num
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 515
  # Looks ahead in the reader without consuming input. The lexer uses
  # peek(0) for the next character to be read; the meaning of other offsets
  # is defined by the reader.
  def peek(i = 0)
    @reader.peek(i)
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 511
  # Delegates to the reader's +peek_equal+ with +str+.
  # NOTE(review): presumably tests whether the upcoming input equals +str+ -
  # confirm against BufferedReader.
  def peek_equal?(str)
    @reader.peek_equal(str)
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 1292
  # Reads the body of a backslash escape (the backslash itself has already
  # been consumed) and returns it exactly as written, without the leading
  # backslash.
  #
  # Recognised forms:
  # * up to three octal digits              (\101)
  # * "x" plus up to two hex digits         (\x4F)
  # * "M-" plus a character or nested escape (\M-a, \M-\C-a)
  # * "C-", "c" or "^" plus a character or nested escape (\C-a, \cx, \^x)
  # * any other single character            (\n)
  #
  # Returns an empty string when the input ends right after the backslash.
  def read_escape
    res = ""
    case ch = getc
    when /[0-7]/
      # re-read the first digit inside the loop so that all three are
      # handled the same way
      ungetc ch
      3.times do
        case ch = getc
        when /[0-7]/
        when nil
          break
        else
          ungetc
          break
        end
        res << ch
      end
      
    when "x"
      res << ch
      2.times do
        case ch = getc
        when /[0-9a-fA-F]/
        when nil
          break
        else
          ungetc
          break
        end
        res << ch
      end

    when "M"
      res << ch
      if (ch = getc) != '-'
        ungetc
      else
        res << ch
        if (ch = getc) == "\\" #"
          res << ch
          res << read_escape
        else
          res << ch if ch
        end
      end

    when "C", "c", "^"
      res << ch
      # Only the "C" form is followed by "-"; "c" and "^" go straight to
      # the controlled character, so nothing extra may be appended for them.
      if ch == "C"
        if (ch = getc) != "-"
          ungetc
          return res
        end
        res << ch
      end
      if (ch = getc) == "\\" #"
        res << ch
        res << read_escape
      else
        res << ch if ch
      end
    else
      res << ch if ch
    end
    res
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 1254
  # Copies the remainder of a "#{...}" interpolation, after its opening
  # brace, up to and including the matching "}". Nested braces are
  # balanced. Returns the copied text; if the input ends first, everything
  # read so far is returned.
  def skip_inner_expression
    depth = 0
    text = ""
    loop do
      ch = getc
      break unless ch

      text << ch
      case ch
      when '{'
        depth += 1
      when '}'
        break if depth == 0
        depth -= 1
      end
    end
    text
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 533
  # Returns the next token from the input.
  #
  # The token start position is recorded first. Matching is delegated to
  # @OP; @space_seen is updated to say whether the token just read was
  # whitespace, and whitespace tokens are skipped while @skip_space is set.
  # A SyntaxError raised during matching aborts the process when
  # @exception_on_syntax_error is set; otherwise a TkError positioned at
  # the current line/column is returned in place of the token. When
  # @read_auto_clean_up is set, get_read is called afterwards.
  def token
    set_token_position(line_no, char_no)

    tk = nil
    loop do
      begin
        tk = @OP.match(self)
        @space_seen = tk.kind_of?(TkSPACE)
      rescue SyntaxError
        abort if @exception_on_syntax_error
        tk = TkError.new(line_no, char_no)
      end
      break unless @skip_space && tk.kind_of?(TkSPACE)
    end

    get_read if @read_auto_clean_up
#   throw :eof unless tk
    p tk if $DEBUG
    tk
  end

[Source]

# File rdoc/parsers/parse_rb.rb, line 507
  # Pushes a character back onto the reader. Most callers pass no argument
  # to un-read the character most recently read; +c+ is forwarded to the
  # reader as given.
  def ungetc(c = nil)
    @reader.ungetc(c)
  end

[Validate]