Added Parser

- can parse a basic token list to an AST
This commit is contained in:
Clemens-Dautermann 2020-08-17 01:07:59 +02:00
parent ea26acce4a
commit b3fe78fffb
19 changed files with 451 additions and 88 deletions

100
Compiler/Lexer/Lexer.cs Normal file
View file

@ -0,0 +1,100 @@
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace Compiler.Lexer
{
    /// <summary>
    /// Splits raw source text into a flat list of <see cref="Token"/>s.
    /// </summary>
    class Lexer
    {
        // Matching rules in priority order (keywords must come before the identifier rule).
        // Built once for the whole process instead of on every call to Next.
        private static readonly KeyValuePair<Regex, TokenType>[] Rules = BuildRules();

        /// <summary>
        /// Creates the ordered rule table. Every pattern is anchored with '^' so it can
        /// only match at the very beginning of the remaining input.
        /// </summary>
        private static KeyValuePair<Regex, TokenType>[] BuildRules()
        {
            Pattern[] patterns =
            {
                new Pattern(@"^\(", TokenType.OpenParenthesisToken),
                new Pattern(@"^\)", TokenType.CloseParenthesisToken),
                // \b instead of a literal space: the keyword may be followed by a tab,
                // a newline, punctuation or the end of input, but not by a word character.
                new Pattern(@"^int\b", TokenType.IntToken),
                new Pattern(@"^}", TokenType.CloseBraceToken),
                new Pattern(@"^{", TokenType.OpenBraceToken),
                new Pattern(@"^return\b", TokenType.ReturnToken),
                new Pattern(@"^;", TokenType.SemicolonToken),
                // identifiers may also start with an underscore
                new Pattern(@"^[a-zA-Z_]\w*", TokenType.IdentifierToken),
                new Pattern(@"^[0-9]+", TokenType.IntegerLiteralToken)
            };

            KeyValuePair<Regex, TokenType>[] rules = new KeyValuePair<Regex, TokenType>[patterns.Length];
            for (int i = 0; i < patterns.Length; i++)
            {
                rules[i] = new KeyValuePair<Regex, TokenType>(patterns[i].ToRegex(), patterns[i].GetTokenType());
            }
            return rules;
        }

        /// <summary>
        /// Transforms a string into a list of tokens.
        /// </summary>
        /// <param name="inputString">The source text to tokenize.</param>
        /// <returns>
        /// The tokens in source order. Every character that starts no known token yields
        /// one <see cref="TokenType.InvalidToken"/> entry and is then skipped.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="inputString"/> is null.</exception>
        /// <exception cref="OverflowException">An integer literal does not fit into an Int32.</exception>
        public List<Token> Lex(String inputString)
        {
            if (inputString == null)
            {
                throw new ArgumentNullException(nameof(inputString));
            }

            List<Token> tokens = new List<Token>();

            // Whitespace (including line breaks) only separates tokens. It is skipped, not
            // deleted up front, so that "int\nmain" does not collapse into "intmain".
            string remaining = inputString.TrimStart();
            while (remaining.Length > 0)
            {
                Token t = Next(remaining);
                tokens.Add(t);

                // An InvalidToken carries Length 0; always consume at least one character,
                // otherwise unrecognised input would never shrink and the loop would not end.
                int consumed = Math.Max(t.Length, 1);

                // Trimming after each token also guarantees that trailing whitespace
                // never reaches Next and produces a bogus InvalidToken.
                remaining = remaining.Substring(consumed).TrimStart();
            }
            return tokens;
        }

        /// <summary>
        /// Returns the token found at the start of <paramref name="input"/>.
        /// </summary>
        /// <param name="input">Non-empty remaining input without leading whitespace.</param>
        /// <returns>
        /// The first rule that matches wins; if none matches, a token of type
        /// <see cref="TokenType.InvalidToken"/> with Length 0 is returned.
        /// </returns>
        private Token Next(String input)
        {
            foreach (KeyValuePair<Regex, TokenType> rule in Rules)
            {
                Match m = rule.Key.Match(input);
                if (!m.Success || m.Index != 0)
                {
                    continue;
                }

                Token t = new Token(rule.Value);
                t.Length = m.Length;
                t.Value = null;

                // Only literals and identifiers carry a value.
                switch (rule.Value)
                {
                    case TokenType.IntegerLiteralToken:
                        try
                        {
                            t.Value = Int32.Parse(m.Value, System.Globalization.CultureInfo.InvariantCulture);
                        }
                        catch (OverflowException ex)
                        {
                            // same exception type as before, but it now names the offending literal
                            throw new OverflowException("Integer literal '" + m.Value + "' does not fit into a 32 bit integer.", ex);
                        }
                        break;
                    case TokenType.IdentifierToken:
                        t.Value = m.Value;
                        break;
                    case TokenType.ReturnToken:
                    case TokenType.CloseParenthesisToken:
                    case TokenType.OpenParenthesisToken:
                    case TokenType.OpenBraceToken:
                    case TokenType.CloseBraceToken:
                    case TokenType.IntToken:
                    case TokenType.SemicolonToken:
                        break;
                    default:
                        // Safety net: a rule was added to the table without being handled here.
                        throw new InvalidOperationException("Match found, but no corresponding token. Check regex!");
                }
                return t;
            }

            // nothing matched at the start of the input
            return new Token(TokenType.InvalidToken);
        }
    }
}

27
Compiler/Lexer/Pattern.cs Normal file
View file

@ -0,0 +1,27 @@
using System;
using System.Text.RegularExpressions;
namespace Compiler.Lexer
{
    /// <summary>
    /// Couples a regular expression (as source text) with the <see cref="TokenType"/>
    /// that a match of this expression stands for.
    /// </summary>
    public class Pattern
    {
        private readonly String pattern;
        private readonly TokenType tokenType;

        // Created on the first call to ToRegex and reused afterwards, so callers that
        // ask for the regex repeatedly do not re-parse the pattern every time.
        private Regex regex;

        /// <summary>
        /// Creates a new pattern.
        /// </summary>
        /// <param name="pattern">Regular expression source text.</param>
        /// <param name="tokenType">Token type associated with a match.</param>
        public Pattern(String pattern, TokenType tokenType)
        {
            this.pattern = pattern;
            this.tokenType = tokenType;
        }

        /// <summary>
        /// Returns the <see cref="Regex"/> for this pattern.
        /// </summary>
        /// <returns>The cached regex; the same instance on every call.</returns>
        /// <exception cref="ArgumentNullException">The pattern text is null.</exception>
        /// <exception cref="ArgumentException">The pattern text is not a valid regular expression.</exception>
        public Regex ToRegex()
        {
            // Built lazily so that an invalid pattern still fails here and not in the constructor.
            if (this.regex == null)
            {
                this.regex = new Regex(this.pattern);
            }
            return this.regex;
        }

        /// <summary>
        /// Returns the token type associated with this pattern.
        /// </summary>
        public TokenType GetTokenType()
        {
            return this.tokenType;
        }
    }
}

29
Compiler/Lexer/Token.cs Normal file
View file

@ -0,0 +1,29 @@
using System;
using System.Linq.Expressions;
namespace Compiler.Lexer
{
    /// <summary>
    /// A single lexical unit: its type, an optional value and its length.
    /// </summary>
    public class Token
    {
        /// <summary>Kind of this token.</summary>
        public TokenType TokenType { get; set; }

        /// <summary>Optional payload of the token; null when there is none.</summary>
        public Object Value { get; set; }

        /// <summary>Length of the token.</summary>
        public int Length { get; set; }

        /// <summary>
        /// Creates a token of the given type. Value and Length keep their defaults.
        /// </summary>
        /// <param name="pTokenType">Kind of the new token.</param>
        public Token(TokenType pTokenType)
        {
            TokenType = pTokenType;
        }

        /// <summary>
        /// Renders the token as "Type", or as "Type:Value" when a value is present.
        /// </summary>
        public override string ToString()
        {
            string typeName = TokenType.ToString();
            if (Value != null)
            {
                return typeName + ":" + Value.ToString();
            }
            return typeName;
        }
    }
}

View file

@ -0,0 +1,19 @@
namespace Compiler.Lexer
{
/// <summary>
/// All kinds of tokens the lexer can emit.
/// </summary>
public enum TokenType
{
/// <summary>The "(" character.</summary>
OpenParenthesisToken,
/// <summary>The ")" character.</summary>
CloseParenthesisToken,
/// <summary>The "int" keyword.</summary>
IntToken,
/// <summary>The "{" character.</summary>
OpenBraceToken,
/// <summary>The "}" character.</summary>
CloseBraceToken,
/// <summary>The "return" keyword.</summary>
ReturnToken,
/// <summary>The ";" character.</summary>
SemicolonToken,
/// <summary>An identifier; the lexer stores the matched text as the token value.</summary>
IdentifierToken,
/// <summary>A run of decimal digits; the lexer stores the parsed integer as the token value.</summary>
IntegerLiteralToken,
/// <summary>Special token that represents input for which no pattern matched.</summary>
InvalidToken
}
}