diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..ccaee7e Binary files /dev/null and b/.DS_Store differ diff --git a/.clang-format b/.clang-format index e7dbe41..6433a09 100644 --- a/.clang-format +++ b/.clang-format @@ -1,28 +1,28 @@ ---- -Language: Cpp -BasedOnStyle: Google - -# Basic Formatting -IndentWidth: 4 -ColumnLimit: 100 -TabWidth: 4 -UseTab: Never - -# Braces and Spacing -BreakBeforeBraces: Attach -AllowShortBlocksOnASingleLine: Empty -AllowShortFunctionsOnASingleLine: Empty -AllowShortIfStatementsOnASingleLine: Never -AllowShortLoopsOnASingleLine: false - -# Pointers and References (e.g., int* ptr instead of int *ptr) -PointerAlignment: Left - -# Alignment for readability -AlignConsecutiveAssignments: true -AlignConsecutiveDeclarations: false -AlignOperands: Align -AlignTrailingComments: true - -# Includes -SortIncludes: true +--- +Language: Cpp +BasedOnStyle: Google + +# Basic Formatting +IndentWidth: 4 +ColumnLimit: 100 +TabWidth: 4 +UseTab: Never + +# Braces and Spacing +BreakBeforeBraces: Attach +AllowShortBlocksOnASingleLine: Empty +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false + +# Pointers and References (e.g., int* ptr instead of int *ptr) +PointerAlignment: Left + +# Alignment for readability +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: false +AlignOperands: Align +AlignTrailingComments: true + +# Includes +SortIncludes: true diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 4d16aa6..c40fd60 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,25 +1,25 @@ -name: C/C++ CI - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - name: Install clang-format - run: | - sudo apt-get update - sudo apt-get install -y clang-format - - - name: Check Formatting - run: make check-format - - - name: Build project - run: make +name: C/C++ CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install clang-format + run: | + sudo apt-get update + sudo apt-get install -y clang-format + + - name: Check Formatting + run: make check-format + + - name: Build project + run: make diff --git a/.gitignore b/.gitignore index 80a6dbe..1722d8e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,44 +1,47 @@ -#build directory -build/ - -# Prerequisites -*.d - -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Linker files -*.ilk - -# Debugger Files -*.pdb - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod -*.smod - -# Compiled Static libraries -*.lai -*.la -*.a -*.lib - -# Executables -*.exe -*.out -*.app - -# debug information files -*.dwo +#build directory +build/ + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Linker files +*.ilk + +# Debugger Files +*.pdb + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# debug information files +*.dwo + +# vscode +.vscode/ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..64d7619 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "makefile.configureOnOpen": true +} diff --git a/LICENSE b/LICENSE index 17601d9..85e3539 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2026 Sujal Kumar - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +MIT License + +Copyright (c) 2026 Sujal Kumar + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Lexer/LexerClasses - Part 1.cpp b/Lexer/LexerClasses - Part 1.cpp index d7c1ae9..05f9a39 100644 --- a/Lexer/LexerClasses - Part 1.cpp +++ b/Lexer/LexerClasses - Part 1.cpp @@ -1,144 +1,144 @@ -#include -#include -#include - -enum class TokenType { - DEF, - RETURN, - IF, - ELSE, - PRINT, - FOR, - WHILE, - - IDENTIFIER, - NUMBER, - STRING, - - PLUS, // + - MINUS, // - - STAR, // * - SLASH, // / - ASSIGN, // = - MODULO, // % - GREATERTHAN, // > - LESSERTHAN, // < - - LPAREN, // ( - RPAREN, // ) - - NEWLINE, - EOF_TOKEN, - - COMMA, // , - COLON, // : - INDENT, - DEDENT, - COMMENT // # -}; - - -class Token{ - public: - // Main part of a token - TokenType type; - std::string value; - - // Used to tell user about error in case error is found - int line; - int column; - - // This constructor will allow us to easily make the tokens while coding - Token(TokenType type, std::string val, int l, int c){ - this->type = type; this->value = val; - this->line = l; this->column = c; - } -}; - - -class Lexer{ - public: - Lexer(std::string input_string){ - this->source_code = preprocess_indents(input_string); - } - std::vector scan_Tokens(); - - private: - std::string source_code; - static inline const std::unordered_map keywords = { - {"def", TokenType::DEF}, - {"return", TokenType::RETURN}, - {"if", TokenType::IF}, - {"else", TokenType::ELSE}, - {"print", TokenType::PRINT}, - {"=", TokenType::ASSIGN}, - {"+", TokenType::PLUS}, - {"-", TokenType::MINUS}, - {"*", TokenType::STAR}, - {"/", TokenType::SLASH}, - {"(", TokenType::LPAREN}, - {")", TokenType::RPAREN}, - {":", TokenType::COLON}, - {",", TokenType::COMMA}, - {"\n", TokenType::NEWLINE}, - {"\t", TokenType::INDENT}, - {"\r", TokenType::DEDENT} - }; - std::vector tokens; - int start = 0; - int current_index = 0; - int line = 1; - - - // FUNCTIONS NEEDED FOR LEXER TO WORK - - bool isAtEnd(); // Checks for last character - char advance(); // Return current char and move forward - char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it - char peekNext(); // Peak at the next character - - void addToken(TokenType type); - - // Specific scanners for complex types - void scanString(); - void scanNumber(); - void scanIdentifier(); - std::string preprocess_indents(std::string raw); -}; - - -std::string Lexer::preprocess_indents(std::string raw) { - std::string clean_code = ""; - bool atLineStart = true; - - for (int i = 0; i < raw.length(); i++) { - if (atLineStart) { - if (raw[i] == ' ') { - if (i + 3 < raw.length() && raw.substr(i, 4) == " ") { - clean_code += '\t'; - i += 3; - continue; - } - } - else if (raw[i] == '\t') { - clean_code += '\t'; - continue; - } - else if (raw[i] == '\n') { - clean_code += '\n'; - atLineStart = true; - continue; - } - else { - atLineStart = false; - } - } - clean_code += raw[i]; - - if (raw[i] == '\n') { - atLineStart = true; - } - } - - return clean_code; +#include +#include +#include + +enum class TokenType { + DEF, + RETURN, + IF, + ELSE, + PRINT, + FOR, + WHILE, + + IDENTIFIER, + NUMBER, + STRING, + + PLUS, // + + MINUS, // - + STAR, // * + SLASH, // / + ASSIGN, // = + MODULO, // % + GREATERTHAN, // > + LESSERTHAN, // < + + LPAREN, // ( + RPAREN, // ) + + NEWLINE, + EOF_TOKEN, + + COMMA, // , + COLON, // : + INDENT, + DEDENT, + COMMENT // # +}; + + +class Token{ + public: + // Main part of a token + TokenType type; + std::string value; + + // Used to tell user about error in case error is found + int line; + int column; + + // This constructor will allow us to easily make the tokens while coding + Token(TokenType type, std::string val, int l, int c){ + this->type = type; this->value = val; + this->line = l; this->column = c; + } +}; + + +class Lexer{ + public: + Lexer(std::string input_string){ + this->source_code = preprocess_indents(input_string); + } + std::vector scan_Tokens(); + + private: + std::string source_code; + static inline const std::unordered_map keywords = { + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"if", TokenType::IF}, + {"else", TokenType::ELSE}, + {"print", TokenType::PRINT}, + {"=", TokenType::ASSIGN}, + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {":", TokenType::COLON}, + {",", TokenType::COMMA}, + {"\n", TokenType::NEWLINE}, + {"\t", TokenType::INDENT}, + {"\r", TokenType::DEDENT} + }; + std::vector tokens; + int start = 0; + int current_index = 0; + int line = 1; + + + // FUNCTIONS NEEDED FOR LEXER TO WORK + + bool isAtEnd(); // Checks for last character + char advance(); // Return current char and move forward + char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it + char peekNext(); // Peak at the next character + + void addToken(TokenType type); + + // Specific scanners for complex types + void scanString(); + void scanNumber(); + void scanIdentifier(); + std::string preprocess_indents(std::string raw); +}; + + +std::string Lexer::preprocess_indents(std::string raw) { + std::string clean_code = ""; + bool atLineStart = true; + + for (int i = 0; i < raw.length(); i++) { + if (atLineStart) { + if (raw[i] == ' ') { + if (i + 3 < raw.length() && raw.substr(i, 4) == " ") { + clean_code += '\t'; + i += 3; + continue; + } + } + else if (raw[i] == '\t') { + clean_code += '\t'; + continue; + } + else if (raw[i] == '\n') { + clean_code += '\n'; + atLineStart = true; + continue; + } + else { + atLineStart = false; + } + } + clean_code += raw[i]; + + if (raw[i] == '\n') { + atLineStart = true; + } + } + + return clean_code; } \ No newline at end of file diff --git a/Makefile b/Makefile index 8c8b5f8..10d66db 100644 --- a/Makefile +++ b/Makefile @@ -1,47 +1,80 @@ -#vars -TARGET_EXEC := executable -BUILD_DIR := build -SRC_DIR := src - -SRCS := $(shell find $(SRC_DIR) -name '*.cpp') -OBJS := $(SRCS:$(SRC_DIR)/%.cpp=$(BUILD_DIR)/%.o) -DEPS := $(OBJS:.o=.d) -INCS := $(shell find $(SRC_DIR) -type d) - -INC_FLAGS := $(addprefix -I,$(INCS)) -CPPFLAGS := $(INC_FLAGS) -MMD -MP -LDFLAGS := - -CXX := g++ - -#all -all: $(BUILD_DIR)/$(TARGET_EXEC) - -#executable dependencies -$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS) - @echo "Linking" - mkdir -p $(BUILD_DIR) - $(CXX) $(OBJS) -o $@ $(LDFLAGS) - -#object dependencies -$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp - @echo "Building dependencies" - mkdir -p $(dir $@) - $(CXX) $(CPPFLAGS) -c $< -o $@ - @echo - -#for change in header files --include $(DEPS) - -.PHONY: all clean run - -run: all - ./$(BUILD_DIR)/$(TARGET_EXEC) - -clean: - rm -rf $(BUILD_DIR) -format: - find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format -i - -check-format: - find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format --dry-run --Werror +#vars +TARGET_EXEC := executable +BUILD_DIR := build +SRC_DIR := src +TEST_DIR := tests + +SRCS := $(shell find $(SRC_DIR) -name '*.cpp') +OBJS := $(SRCS:$(SRC_DIR)/%.cpp=$(BUILD_DIR)/%.o) +DEPS := $(OBJS:.o=.d) +INCS := $(shell find $(SRC_DIR) -type d) + +LEXER_TEST := $(BUILD_DIR)/$(TEST_DIR)/lexer_test +PARSER_TEST := $(BUILD_DIR)/$(TEST_DIR)/parser_test +EVALUATOR_TEST := $(BUILD_DIR)/$(TEST_DIR)/evaluator_test +TEST_DEPS := $(LEXER_TEST).d $(PARSER_TEST).d + +INC_FLAGS := $(addprefix -I,$(INCS)) +CPPFLAGS := $(INC_FLAGS) -MMD -MP +LDFLAGS := + +CXX := g++ + +#all +all: $(BUILD_DIR)/$(TARGET_EXEC) + +#executable dependencies +$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS) + @echo "Linking" + mkdir -p $(BUILD_DIR) + $(CXX) $(OBJS) -o $@ $(LDFLAGS) + +#object dependencies +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp + @echo "Building dependencies" + mkdir -p $(dir $@) + $(CXX) $(CPPFLAGS) -c $< -o $@ + @echo + +#for change in header files +-include $(DEPS) +-include $(TEST_DEPS) + +.PHONY: all clean run + +run: all + ./$(BUILD_DIR)/$(TARGET_EXEC) + +clean: + rm -rf $(BUILD_DIR) +format: + find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format -i + +check-format: + find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format --dry-run --Werror + + +# separate tests for lexer, parser, evaluator +test-lexer: $(LEXER_TEST) + ./$(LEXER_TEST) + +test-parser: $(PARSER_TEST) + ./$(PARSER_TEST) + +test-evaluator: $(EVALUATOR_TEST) + ./$(EVALUATOR_TEST) + +# macro for executable and object dependencies for tests +define TEST_RULE +$(1): $(OBJS) $(1).o + mkdir -p $(BUILD_DIR)/$(TEST_DIR) + $(CXX) $(CPPFLAGS) $$^ -o $$@ + +$(1).o: $(TEST_DIR)/$(notdir $(1)).cpp + mkdir -p $(BUILD_DIR)/$(TEST_DIR) + $(CXX) $(CPPFLAGS) -c $$< -o $$@ +endef + +$(eval $(call TEST_RULE,$(LEXER_TEST))) +$(eval $(call TEST_RULE,$(PARSER_TEST))) +$(eval $(call TEST_RULE,$(EVALUATOR_TEST))) diff --git a/README.md b/README.md index ca0d79c..9a5e077 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# pydotcpp -Will add later +# pydotcpp +Will add later diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000..f7b50e2 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/common/.DS_Store b/src/common/.DS_Store new file mode 100644 index 0000000..21b0671 Binary files /dev/null and b/src/common/.DS_Store differ diff --git a/src/common/token/token.cpp b/src/common/token/token.cpp new file mode 100644 index 0000000..02d1ee6 --- /dev/null +++ b/src/common/token/token.cpp @@ -0,0 +1,5 @@ +#include +#include + +Token::Token(TokenType type, std::string val, int l, int c) + : type(type), value(val), line(l), column(c) {} diff --git a/src/common/token/token.hpp b/src/common/token/token.hpp index 7489d7a..6af23e8 100644 --- a/src/common/token/token.hpp +++ b/src/common/token/token.hpp @@ -1,57 +1,84 @@ -#pragma once - -#include - -enum class TokenType { - DEF, - RETURN, - IF, - ELSE, - FOR, - WHILE, - - IDENTIFIER, - NUMBER, - STRING, - - PLUS, // + - MINUS, // - - STAR, // * - SLASH, // / - ASSIGN, // = - MODULO, // % - GREATERTHAN, // > - LESSERTHAN, // < - - LPAREN, // ( - RPAREN, // ) - - NEWLINE, - EOF_TOKEN, - - COMMA, // , - COLON, // : - INDENT, - DEDENT, - COMMENT // # -}; - - -class Token { -public: - // Main part of a token - TokenType type; - std::string value; - - // Used to tell user about error in case error is found - int line; - int column; - - // This constructor will allow us to easily make the tokens while coding - Token(TokenType type, std::string val, int l, int c); -}; - - - - - +#pragma once + +#include + +enum class TokenType { + DEF, + RETURN, + IF, + ELIF, + ELSE, + FOR, + WHILE, + BREAK, + CONTINUE, + PASS, + TRUE, + FALSE, + NONE, + AND, + OR, + NOT, + IN, + IS, + + IDENTIFIER, + NUMBER, + STRING, + + PLUS, // + + MINUS, // - + STAR, // * + SLASH, // / + FLOORDIV, // // + POWER, // ** + ASSIGN, // = + MODULO, // % + GREATERTHAN, // > + LESSERTHAN, // < + GREATEREQUAL, // >= + LESSEQUAL, // <= + EQEQUAL, // == + NOTEQUAL, // != + PLUSEQUAL, // += + MINUSEQUAL, // -= + STAREQUAL, // *= + SLASHEQUAL, // /= + + + LPAREN, // ( + RPAREN, // ) + + NEWLINE, + EOF_TOKEN, + + COMMA, // , + COLON, // : + INDENT, + DEDENT, + COMMENT, // # + PRINT, + AMPERSAND, // & + PIPE, // | + SPACE, +}; + + +class Token { +public: + // Main part of a token + TokenType type; + std::string value; + + // Used to tell user about error in case error is found + int line; + int column; + + // This constructor will allow us to easily make the tokens while coding + Token(TokenType type, std::string val, int l, int c); +}; + + + + + diff --git a/src/lexer/keywords.hpp b/src/lexer/keywords.hpp index e69de29..460cf11 100644 --- a/src/lexer/keywords.hpp +++ b/src/lexer/keywords.hpp @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +#include "common/token/token.hpp" + +inline const std::unordered_map keywords = { + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"if", TokenType::IF}, + {"elif", TokenType::ELIF}, + {"else", TokenType::ELSE}, + {"for", TokenType::FOR}, + {"while", TokenType::WHILE}, + {"break", TokenType::BREAK}, + {"continue", TokenType::CONTINUE}, + {"pass", TokenType::PASS}, + {"True", TokenType::TRUE}, + {"False", TokenType::FALSE}, + {"None", TokenType::NONE}, + {"and", TokenType::AND}, + {"or", TokenType::OR}, + {"not", TokenType::NOT}, + {"in", TokenType::IN}, + {"is", TokenType::IS}, + {"print", TokenType::PRINT}, +}; + +inline const std::unordered_map operators = { + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"//", TokenType::FLOORDIV}, + {"**", TokenType::POWER}, + {"=", TokenType::ASSIGN}, + {"%", TokenType::MODULO}, + {">", TokenType::GREATERTHAN}, + {"<", TokenType::LESSERTHAN}, + {">=", TokenType::GREATEREQUAL}, + {"<=", TokenType::LESSEQUAL}, + {"==", TokenType::EQEQUAL}, + {"!=", TokenType::NOTEQUAL}, + {"+=", TokenType::PLUSEQUAL}, + {"-=", TokenType::MINUSEQUAL}, + {"*=", TokenType::STAREQUAL}, + {"/=", TokenType::SLASHEQUAL}, + {"&", TokenType::AMPERSAND}, + {"|", TokenType::PIPE}, +}; + +inline const std::unordered_map delimiters = { + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {",", TokenType::COMMA}, + {":", TokenType::COLON}, +// {"#", TokenType::COMMENT}, + {"\n", TokenType::NEWLINE}, +}; diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index e69de29..e9bccc9 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -0,0 +1,280 @@ +#include // for exponentiation in scientific notation +#include +#include +#include +#include + +Lexer::Lexer(std::string input_string) : source_code(input_string) { + indent_stack.push(0); // stack initially must have 0 +} +bool Lexer::isAtEnd() const { // checks if we are at the end of source code + if (current_index >= source_code.size()) { + return true; + } + return false; +} + +char Lexer::peek() const { // returns character at current index + if (isAtEnd()) { + return '\0'; + } + return source_code[current_index]; +} + +char Lexer::peekNext() const { // returns character at next index + if (current_index + 1 >= source_code.size()) { + return '\0'; + } + return source_code[current_index + 1]; +} + +char Lexer::advance() { // returns character at current index and + if (isAtEnd()) { // makes necessary changes to line and column + return '\0'; + } + char c = source_code[current_index++]; + if (c == '\n') { + line++; + column = 1; + } else { + column++; + } + return c; +} + +void Lexer::processIndent() { // maintains an indent stack and adds indent and dedent tokens + // wherever necessary + int indent = 0; + + while (peek() == ' ' || peek() == '\t') { // handles spaces and tabs + switch (peek()) { + case '\t': + indent += 4; + break; + default: + indent += 1; + } + advance(); + } + if (peek() == '\n' || peek() == '\0' || peek() == '#') + return; // empty line, EOF, comments don't affect indentation + + if (indent > indent_stack.top()) { // current indent is larger; add indent token + indent_stack.push(indent); + Token token(TokenType::INDENT, "", line, 1); + tokens.push_back(token); + } else if (indent < indent_stack.top()) { // add dedent tokens until indentations match + while (!indent_stack.empty() && + indent_stack.top() != indent) { // current indent must be present elsewhere in the + indent_stack.pop(); // stack, else it is an error + Token token(TokenType::DEDENT, "", line, 1); + tokens.push_back(token); + } + if (indent_stack.empty()) { + // throw an error - invalid indentation + } + } +} + +void Lexer::scanNumber(std::string num) { + int start = column - 1; // column of the number token + + if (num == "0" && std::isdigit(peek())) { + // throw an error, leading zero + } + + while (std::isdigit(peek())) { + num += Lexer::advance(); + } + if (peek() == '.' && num[0] == '.') { + // throw an error - we can have .23 and 23.23, but .23.23 is error + } + if (peek() == '.' && std::isdigit(peekNext())) { // for floating point numbers + num += advance(); + while (std::isdigit(peek())) { + num += advance(); + } + } + + if ((peek() == 'e' || peek() == 'E')) { // scientific notation + num += advance(); + if (!std::isdigit(peek()) && peek() != '+' && peek() != '-') { + // throw an error - invalid syntax + } + // power can only be an integer in scientific notation in python + if (peek() == '+' || peek() == '-') { + if (!std::isdigit(peekNext())) { + // throw an error - invalid syntax + } + num += advance(); + } + while (std::isdigit(peek())) { + num += advance(); + } + } + + if (std::isalnum(peek()) || peek() == '_' || peek() == '.') { + // throw an error, invalid floating point number + } + + Token token(TokenType::NUMBER, num, line, start); + tokens.push_back(token); +} + +void Lexer::scanString(std::string str) { + int start = column - 1; + while (!isAtEnd() && peek() != str[0] && peek() != '\n') { + str += advance(); + } + if (isAtEnd()) { + // throw an error + } else if (peek() == '\n') { + // throw an error + } else { + str += advance(); + } + Token token(TokenType::STRING, str, line, start); + tokens.push_back(token); +} + +void Lexer::scanIdentifier(std::string identifier) { + int start = column - 1; + while (std::isalnum(peek()) || peek() == '_') { + identifier += advance(); + } + Token token(TokenType::IDENTIFIER, identifier, line, start); + if (keywords.count(identifier)) { + token.type = keywords.at(identifier); + } + tokens.push_back(token); +} + +std::vector Lexer::scan_Tokens() { + while (true) { + if (isAtEnd()) { + Token token(TokenType::EOF_TOKEN, "", line, column); + tokens.push_back(token); + break; + } + std::string curr = ""; + curr += advance(); + + if (curr == " " || curr == "\t") { + continue; + } + + // handle two-character operators + if (curr == "=" && peek() == '=') { + advance(); + Token token(TokenType::EQEQUAL, "==", line, + column - 2); // using column-2 since we are advancing twice + tokens.push_back(token); + continue; + } + + if (curr == ">" && peek() == '=') { + advance(); + Token token(TokenType::GREATEREQUAL, ">=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "<" && peek() == '=') { + advance(); + Token token(TokenType::LESSEQUAL, "<=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "+" && peek() == '=') { + advance(); + Token token(TokenType::PLUSEQUAL, "+=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "-" && peek() == '=') { + advance(); + Token token(TokenType::MINUSEQUAL, "-=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "*" && peek() == '=') { + advance(); + Token token(TokenType::STAREQUAL, "*=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "/" && peek() == '=') { + advance(); + Token token(TokenType::SLASHEQUAL, "/=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "/" && peek() == '/') { + advance(); + Token token(TokenType::FLOORDIV, "//", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "*" && peek() == '*') { + advance(); + Token token(TokenType::POWER, "**", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "!") { + if (peek() == '=') { + advance(); + Token token(TokenType::NOTEQUAL, "!=", line, column - 2); + tokens.push_back(token); + continue; + } else { + // throw an error + } + } + + auto it = operators.find( + curr); // curr cannot be found in keywords at this point; it is a single character + if ((it = operators.find(curr)) != operators.end()) { + TokenType type = it->second; + Token token(type, curr, line, column); + tokens.push_back(token); + } else if ((it = delimiters.find(curr)) != delimiters.end()) { + TokenType type = it->second; + + if (type == TokenType::NEWLINE) { // add newline token, and process indentation + column = 1; + Token token(type, curr, line - 1, column); + tokens.push_back(token); + processIndent(); + } else { + Token token(type, curr, line, column); + tokens.push_back(token); + } + } else { + if (curr == "\"" || curr == "'") { + scanString(curr); + } else if (std::isdigit(curr[0]) || + (curr[0] == '.' && + std::isdigit(peek()))) { // .23 is also valid syntax in python + scanNumber(curr); + } else if (std::isalpha(curr[0]) || curr[0] == '_') { // _foo is also valid + scanIdentifier(curr); + } else if (curr[0] == '#') { // comment, ignore everything until newline or EOF + curr = ""; + while (peek() != '\n' && peek() != '\0') { + advance(); + } + } else { + // throw an error - unexpected character + } + } + } + return tokens; +} diff --git a/src/lexer/lexer.hpp b/src/lexer/lexer.hpp index cb547a2..ad06881 100644 --- a/src/lexer/lexer.hpp +++ b/src/lexer/lexer.hpp @@ -1,53 +1,40 @@ -#pragma once - -#include -#include -#include - -#include "token.hpp" - -class Lexer { -public: - Lexer(std::string input_string); - std::vector scan_Tokens(void); - -private: - std::string source_code; - static inline const std::unordered_map keywords = { - {"def", TokenType::DEF}, - {"return", TokenType::RETURN}, - {"if", TokenType::IF}, - {"else", TokenType::ELSE}, - {"print", TokenType::PRINT}, - {"=", TokenType::ASSIGN}, - {"+", TokenType::PLUS}, - {"-", TokenType::MINUS}, - {"*", TokenType::STAR}, - {"/", TokenType::SLASH}, - {"(", TokenType::LPAREN}, - {")", TokenType::RPAREN}, - {":", TokenType::COLON}, - {",", TokenType::COMMA}, - {"\n", TokenType::NEWLINE}, - {"\t", TokenType::INDENT} - }; - std::vector tokens; - int start = 0; - int current_index = 0; - int line = 1; - - - // FUNCTIONS NEEDED FOR LEXER TO WORK - bool isAtEnd(void); // Checks for last character - char advance(void); // Return current char and move forward - char peek(void); // Sometimes, we don't actually want to read a character and may only want to peek at it - char peekNext(void); // Peak at the next character - - void addToken(TokenType type); - - // Specific scanners for complex types - void scanString(void); - void scanNumber(void); - void scanIdentifier(void); - std::string preprocess_indents(std::string raw); -}; +#pragma once + +#include +#include +#include +#include //to check if char is alphanumeric +#include + +#include "common/token/token.hpp" + +class Lexer { +public: + Lexer(std::string input_string); + std::vector scan_Tokens(); + + +private: + std::string source_code; + std::vector tokens; + int start = 0; + int current_index = 0; + int line = 1; + int column = 1; + std::stack indent_stack; + + + // FUNCTIONS NEEDED FOR LEXER TO WORK + bool isAtEnd() const; // Checks for last character + char advance(); // Return current char and move forward + char peek() const; // Sometimes, we don't actually want to read a character and may only want to peek at it + char peekNext() const; // Peak at the next character + + void addToken(TokenType type); + + // Specific scanners for complex types + void scanString(std::string first); + void scanNumber(std::string first); + void scanIdentifier(std::string first); + void processIndent(); // To process indents at the start of every line +}; diff --git a/tests/lexer_test.cpp b/tests/lexer_test.cpp new file mode 100644 index 0000000..9812da6 --- /dev/null +++ b/tests/lexer_test.cpp @@ -0,0 +1,51 @@ +#include +#include "lexer/lexer.hpp" + +int passed = 0; +int failed = 0; + +void test(std::string input_code, std::vector> expected) { + Lexer lexer(input_code); + std::vector tokens = lexer.scan_Tokens(); + + for (int i=0; i 0 ? 1 : 0; +}