From 8ae90c214b98eb6cc38f7e64d85b8b3beefacb38 Mon Sep 17 00:00:00 2001 From: Sasank <213149805+sash070@users.noreply.github.com> Date: Mon, 2 Feb 2026 21:59:28 +0530 Subject: [PATCH 01/23] cahnge --- madhav.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/madhav.txt b/madhav.txt index 9012a49..51568ea 100644 --- a/madhav.txt +++ b/madhav.txt @@ -1 +1 @@ -i like madhav more than no one +i like madhav more than sujal From fd7bb9d4629e9c4fee44a0ee46f1490c2baf68b8 Mon Sep 17 00:00:00 2001 From: Aayush Ranjan Date: Mon, 2 Feb 2026 22:10:37 +0530 Subject: [PATCH 02/23] test1 --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test.txt diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..e69de29 From 0f08280b405a014886ebf57efc85dc6cece47828 Mon Sep 17 00:00:00 2001 From: Aayush Ranjan Date: Mon, 2 Feb 2026 22:23:35 +0530 Subject: [PATCH 03/23] hi --- test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test.txt b/test.txt index e69de29..c997bcb 100644 --- a/test.txt +++ b/test.txt @@ -0,0 +1 @@ +hi \ No newline at end of file From 9f279e5626edd2c91457bd22e7ac7588abbeec05 Mon Sep 17 00:00:00 2001 From: Aayush Ranjan Date: Mon, 2 Feb 2026 22:26:50 +0530 Subject: [PATCH 04/23] hi --- test.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test.txt b/test.txt index c997bcb..94f999c 100644 --- a/test.txt +++ b/test.txt @@ -1 +1,2 @@ -hi \ No newline at end of file +hi +hol \ No newline at end of file From 1372a8c69a77fa0baddbc0ce22029928b1d3ca72 Mon Sep 17 00:00:00 2001 From: AayushRanjan10 Date: Mon, 2 Feb 2026 22:28:26 +0530 Subject: [PATCH 05/23] hi --- test.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test.txt b/test.txt index 94f999c..85af2dd 100644 --- a/test.txt +++ b/test.txt @@ -1,2 +1,3 @@ hi -hol \ No newline at end of file +hol +hu \ No newline at end of file From 82657a957a58b99f8ff5c8f82606e4f345bed547 Mon Sep 17 00:00:00 2001 From: AayushRanjan10 Date: Mon, 2 Feb 2026 22:28:45 +0530 Subject: [PATCH 06/23] hi --- test.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test.txt b/test.txt index 85af2dd..bd60fba 100644 --- a/test.txt +++ b/test.txt @@ -1,3 +1,4 @@ hi hol -hu \ No newline at end of file +hu +hlo \ No newline at end of file From 7149b44cecf34cbcee6f91ad1b1e7a274c46ec90 Mon Sep 17 00:00:00 2001 From: AayushRanjan10 Date: Mon, 2 Feb 2026 22:30:06 +0530 Subject: [PATCH 07/23] hlo --- test.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test.txt b/test.txt index bd60fba..0bc00d4 100644 --- a/test.txt +++ b/test.txt @@ -1,4 +1,6 @@ hi hol hu -hlo \ No newline at end of file +hlo + +i am super xrazt \ No newline at end of file From b220e9b507fcb9ecf600dc478eb09d3f53c6ffad Mon Sep 17 00:00:00 2001 From: Sasank <213149805+sash070@users.noreply.github.com> Date: Mon, 2 Feb 2026 22:40:44 +0530 Subject: [PATCH 08/23] change --- madhav.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/madhav.txt b/madhav.txt index 51568ea..0a1fc2e 100644 --- a/madhav.txt +++ b/madhav.txt @@ -1 +1,2 @@ i like madhav more than sujal +change From 81b9a990832d76e8483c3a3fe921bcd1d6673e30 Mon Sep 17 00:00:00 2001 From: Sasank <213149805+sash070@users.noreply.github.com> Date: Sat, 28 Feb 2026 17:03:31 +0530 Subject: [PATCH 09/23] added basic lexer implementation except errors and indent stack --- .vscode/settings.json | 3 + src/common/token/token.cpp | 5 ++ src/common/token/token.hpp | 6 +- src/lexer/keywords.hpp | 27 +++++++++ src/lexer/lexer.cpp | 117 +++++++++++++++++++++++++++++++++++++ src/lexer/lexer.hpp | 37 ++++-------- 6 files changed, 168 insertions(+), 27 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 src/common/token/token.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ad52559 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "makefile.configureOnOpen": true +} diff --git a/src/common/token/token.cpp b/src/common/token/token.cpp new file mode 100644 index 0000000..cc33e7e --- /dev/null +++ b/src/common/token/token.cpp @@ -0,0 +1,5 @@ +#include +#include + +Token::Token(TokenType type, std::string val, int l, int c) : type(type), value(val), line(l), column(c){} + diff --git a/src/common/token/token.hpp b/src/common/token/token.hpp index 7489d7a..8f2196d 100644 --- a/src/common/token/token.hpp +++ b/src/common/token/token.hpp @@ -33,7 +33,11 @@ enum class TokenType { COLON, // : INDENT, DEDENT, - COMMENT // # + COMMENT, // # + PRINT, + AMPERSAND, // & + PIPE, // | + SPACE, }; diff --git a/src/lexer/keywords.hpp b/src/lexer/keywords.hpp index e69de29..838abd5 100644 --- a/src/lexer/keywords.hpp +++ b/src/lexer/keywords.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include + +#include "token.hpp" + +inline const std::unordered_map keywords = { + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"if", TokenType::IF}, + {"else", TokenType::ELSE}, + {"print", TokenType::PRINT}, + {"=", TokenType::ASSIGN}, + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {":", TokenType::COLON}, + {",", TokenType::COMMA}, + {"\n", TokenType::NEWLINE}, + {"\t", TokenType::INDENT}, + {" ", TokenType::SPACE} + }; diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index e69de29..54dcd69 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include + +Lexer::Lexer(std::string input_string) : source_code(input_string){} +bool Lexer::isAtEnd(){ + if (current_index >= source_code.size()){ + return true; + } + return false; +} +char Lexer::peek(){ + if (current_index >= source_code.size()){ + return '\0'; + } + return source_code[current_index]; +} + +char Lexer::peekNext(){ + if (current_index+1 >= source_code.size()){ + return '\0'; + } + return source_code[current_index+1]; +} + +char Lexer::advance(){ + if (current_index >= source_code.size()){ + return '\0'; + } + char c = source_code[current_index++]; + if (c == '\n'){ + line++; + column=1; + } + else {column++;} + return c; +} + +void Lexer::scanNumber(std::string curr){ + int start = column-1; + while (std::isdigit(peek())){ + curr += Lexer::advance(); + } + if (std::isalnum(peek())){ + // throw an error + } + Token token(TokenType::NUMBER, curr, line, start); + tokens.push_back(token); +} + +void Lexer::scanString(std::string quote){ + int start = column-1; + while (!isAtEnd() && peek()!=quote[0] && peek()!='\n'){ + quote += advance(); + } + if (isAtEnd()){ + // throw an error + } + else if (peek()=='\n'){ + // throw an error + } + else{ + quote += advance(); + } + Token token(TokenType::STRING, quote, line, start); + tokens.push_back(token); +} + +void Lexer::scanIdentifier(std::string curr){ + int start = column-1; + while (std::isalnum(peek()) || peek()=='_'){ + curr += advance(); + } + Token token(TokenType::IDENTIFIER, curr, line, start); + if (keywords.count(curr)){ + token.type = keywords.at(curr); + } + tokens.push_back(token); +} + + +std::vector Lexer::scan_Tokens(){ + while (true){ + if (isAtEnd()){ + Token token(TokenType::EOF_TOKEN, "", line, column); + tokens.push_back(token); + break; + } + std::string curr = ""; + curr += advance(); + if (keywords.count(curr)){ + TokenType type = keywords.at(curr); + if (type == TokenType::NEWLINE){ + column = 0; + line++; + } + else if (type == TokenType::INDENT){ + column += 3; + } + else if (type == TokenType::DEDENT){ + column -= 5; + } + Token token(type, curr, line, column); + tokens.push_back(token); + } + else{ + if (std::isdigit(curr[0])){ + scanNumber(curr); + } + else if (std::isalpha(curr[0])){ + scanIdentifier(curr); + } + } +} +} diff --git a/src/lexer/lexer.hpp b/src/lexer/lexer.hpp index cb547a2..ae15f7a 100644 --- a/src/lexer/lexer.hpp +++ b/src/lexer/lexer.hpp @@ -3,51 +3,36 @@ #include #include #include +#include //to check if char is alphanumeric #include "token.hpp" class Lexer { public: Lexer(std::string input_string); - std::vector scan_Tokens(void); + std::vector scan_Tokens(); + private: std::string source_code; - static inline const std::unordered_map keywords = { - {"def", TokenType::DEF}, - {"return", TokenType::RETURN}, - {"if", TokenType::IF}, - {"else", TokenType::ELSE}, - {"print", TokenType::PRINT}, - {"=", TokenType::ASSIGN}, - {"+", TokenType::PLUS}, - {"-", TokenType::MINUS}, - {"*", TokenType::STAR}, - {"/", TokenType::SLASH}, - {"(", TokenType::LPAREN}, - {")", TokenType::RPAREN}, - {":", TokenType::COLON}, - {",", TokenType::COMMA}, - {"\n", TokenType::NEWLINE}, - {"\t", TokenType::INDENT} - }; std::vector tokens; int start = 0; int current_index = 0; int line = 1; + int column = 1; // FUNCTIONS NEEDED FOR LEXER TO WORK - bool isAtEnd(void); // Checks for last character - char advance(void); // Return current char and move forward - char peek(void); // Sometimes, we don't actually want to read a character and may only want to peek at it - char peekNext(void); // Peak at the next character + bool isAtEnd(); // Checks for last character + char advance(); // Return current char and move forward + char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it + char peekNext(); // Peak at the next character void addToken(TokenType type); // Specific scanners for complex types - void scanString(void); - void scanNumber(void); - void scanIdentifier(void); + void scanString(std::string first); + void scanNumber(std::string first); + void scanIdentifier(std::string first); std::string preprocess_indents(std::string raw); }; From 60a3910dfec9c88dece852dea369c7fafd8e03ea Mon Sep 17 00:00:00 2001 From: Sasank <213149805+sash070@users.noreply.github.com> Date: Sat, 28 Feb 2026 17:03:31 +0530 Subject: [PATCH 10/23] added basic lexer implementation except errors and indent stack --- .vscode/settings.json | 3 + src/common/token/token.cpp | 5 ++ src/common/token/token.hpp | 6 +- src/lexer/keywords.hpp | 27 +++++++++ src/lexer/lexer.cpp | 117 +++++++++++++++++++++++++++++++++++++ src/lexer/lexer.hpp | 37 ++++-------- 6 files changed, 168 insertions(+), 27 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 src/common/token/token.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ad52559 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "makefile.configureOnOpen": true +} diff --git a/src/common/token/token.cpp b/src/common/token/token.cpp new file mode 100644 index 0000000..cc33e7e --- /dev/null +++ b/src/common/token/token.cpp @@ -0,0 +1,5 @@ +#include +#include + +Token::Token(TokenType type, std::string val, int l, int c) : type(type), value(val), line(l), column(c){} + diff --git a/src/common/token/token.hpp b/src/common/token/token.hpp index 7489d7a..8f2196d 100644 --- a/src/common/token/token.hpp +++ b/src/common/token/token.hpp @@ -33,7 +33,11 @@ enum class TokenType { COLON, // : INDENT, DEDENT, - COMMENT // # + COMMENT, // # + PRINT, + AMPERSAND, // & + PIPE, // | + SPACE, }; diff --git a/src/lexer/keywords.hpp b/src/lexer/keywords.hpp index e69de29..838abd5 100644 --- a/src/lexer/keywords.hpp +++ b/src/lexer/keywords.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include + +#include "token.hpp" + +inline const std::unordered_map keywords = { + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"if", TokenType::IF}, + {"else", TokenType::ELSE}, + {"print", TokenType::PRINT}, + {"=", TokenType::ASSIGN}, + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {":", TokenType::COLON}, + {",", TokenType::COMMA}, + {"\n", TokenType::NEWLINE}, + {"\t", TokenType::INDENT}, + {" ", TokenType::SPACE} + }; diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index e69de29..54dcd69 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include + +Lexer::Lexer(std::string input_string) : source_code(input_string){} +bool Lexer::isAtEnd(){ + if (current_index >= source_code.size()){ + return true; + } + return false; +} +char Lexer::peek(){ + if (current_index >= source_code.size()){ + return '\0'; + } + return source_code[current_index]; +} + +char Lexer::peekNext(){ + if (current_index+1 >= source_code.size()){ + return '\0'; + } + return source_code[current_index+1]; +} + +char Lexer::advance(){ + if (current_index >= source_code.size()){ + return '\0'; + } + char c = source_code[current_index++]; + if (c == '\n'){ + line++; + column=1; + } + else {column++;} + return c; +} + +void Lexer::scanNumber(std::string curr){ + int start = column-1; + while (std::isdigit(peek())){ + curr += Lexer::advance(); + } + if (std::isalnum(peek())){ + // throw an error + } + Token token(TokenType::NUMBER, curr, line, start); + tokens.push_back(token); +} + +void Lexer::scanString(std::string quote){ + int start = column-1; + while (!isAtEnd() && peek()!=quote[0] && peek()!='\n'){ + quote += advance(); + } + if (isAtEnd()){ + // throw an error + } + else if (peek()=='\n'){ + // throw an error + } + else{ + quote += advance(); + } + Token token(TokenType::STRING, quote, line, start); + tokens.push_back(token); +} + +void Lexer::scanIdentifier(std::string curr){ + int start = column-1; + while (std::isalnum(peek()) || peek()=='_'){ + curr += advance(); + } + Token token(TokenType::IDENTIFIER, curr, line, start); + if (keywords.count(curr)){ + token.type = keywords.at(curr); + } + tokens.push_back(token); +} + + +std::vector Lexer::scan_Tokens(){ + while (true){ + if (isAtEnd()){ + Token token(TokenType::EOF_TOKEN, "", line, column); + tokens.push_back(token); + break; + } + std::string curr = ""; + curr += advance(); + if (keywords.count(curr)){ + TokenType type = keywords.at(curr); + if (type == TokenType::NEWLINE){ + column = 0; + line++; + } + else if (type == TokenType::INDENT){ + column += 3; + } + else if (type == TokenType::DEDENT){ + column -= 5; + } + Token token(type, curr, line, column); + tokens.push_back(token); + } + else{ + if (std::isdigit(curr[0])){ + scanNumber(curr); + } + else if (std::isalpha(curr[0])){ + scanIdentifier(curr); + } + } +} +} diff --git a/src/lexer/lexer.hpp b/src/lexer/lexer.hpp index cb547a2..ae15f7a 100644 --- a/src/lexer/lexer.hpp +++ b/src/lexer/lexer.hpp @@ -3,51 +3,36 @@ #include #include #include +#include //to check if char is alphanumeric #include "token.hpp" class Lexer { public: Lexer(std::string input_string); - std::vector scan_Tokens(void); + std::vector scan_Tokens(); + private: std::string source_code; - static inline const std::unordered_map keywords = { - {"def", TokenType::DEF}, - {"return", TokenType::RETURN}, - {"if", TokenType::IF}, - {"else", TokenType::ELSE}, - {"print", TokenType::PRINT}, - {"=", TokenType::ASSIGN}, - {"+", TokenType::PLUS}, - {"-", TokenType::MINUS}, - {"*", TokenType::STAR}, - {"/", TokenType::SLASH}, - {"(", TokenType::LPAREN}, - {")", TokenType::RPAREN}, - {":", TokenType::COLON}, - {",", TokenType::COMMA}, - {"\n", TokenType::NEWLINE}, - {"\t", TokenType::INDENT} - }; std::vector tokens; int start = 0; int current_index = 0; int line = 1; + int column = 1; // FUNCTIONS NEEDED FOR LEXER TO WORK - bool isAtEnd(void); // Checks for last character - char advance(void); // Return current char and move forward - char peek(void); // Sometimes, we don't actually want to read a character and may only want to peek at it - char peekNext(void); // Peak at the next character + bool isAtEnd(); // Checks for last character + char advance(); // Return current char and move forward + char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it + char peekNext(); // Peak at the next character void addToken(TokenType type); // Specific scanners for complex types - void scanString(void); - void scanNumber(void); - void scanIdentifier(void); + void scanString(std::string first); + void scanNumber(std::string first); + void scanIdentifier(std::string first); std::string preprocess_indents(std::string raw); }; From fcee90ae1df1c6dd9c4fa8bc3e0b040807261cdb Mon Sep 17 00:00:00 2001 From: Sujal Kumar Date: Mon, 2 Mar 2026 13:08:15 +0530 Subject: [PATCH 11/23] Minor tweak to improve readability --- src/lexer/lexer.cpp | 58 ++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index 54dcd69..4249533 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -11,6 +11,7 @@ bool Lexer::isAtEnd(){ } return false; } + char Lexer::peek(){ if (current_index >= source_code.size()){ return '\0'; @@ -80,38 +81,37 @@ void Lexer::scanIdentifier(std::string curr){ tokens.push_back(token); } - std::vector Lexer::scan_Tokens(){ while (true){ - if (isAtEnd()){ - Token token(TokenType::EOF_TOKEN, "", line, column); - tokens.push_back(token); - break; - } - std::string curr = ""; - curr += advance(); - if (keywords.count(curr)){ - TokenType type = keywords.at(curr); - if (type == TokenType::NEWLINE){ - column = 0; - line++; + if (isAtEnd()){ + Token token(TokenType::EOF_TOKEN, "", line, column); + tokens.push_back(token); + break; } - else if (type == TokenType::INDENT){ - column += 3; - } - else if (type == TokenType::DEDENT){ - column -= 5; + std::string curr = ""; + curr += advance(); + if (keywords.count(curr)){ + TokenType type = keywords.at(curr); + if (type == TokenType::NEWLINE){ + column = 0; + line++; + } + else if (type == TokenType::INDENT){ + column += 3; + } + else if (type == TokenType::DEDENT){ + column -= 5; + } + Token token(type, curr, line, column); + tokens.push_back(token); } - Token token(type, curr, line, column); - tokens.push_back(token); + else{ + if (std::isdigit(curr[0])){ + scanNumber(curr); + } + else if (std::isalpha(curr[0])){ + scanIdentifier(curr); + } + } } - else{ - if (std::isdigit(curr[0])){ - scanNumber(curr); - } - else if (std::isalpha(curr[0])){ - scanIdentifier(curr); - } - } -} } From 5216e68d1dce2a9e463bd5be3db63ac3b0af9fb6 Mon Sep 17 00:00:00 2001 From: Sujal Kumar <91939382+SujalKumar06@users.noreply.github.com> Date: Mon, 2 Mar 2026 11:42:08 +0530 Subject: [PATCH 12/23] Simplify CI workflow by removing configure and checks Removed unnecessary steps from the CI workflow. --- .github/workflows/c-cpp.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/c-cpp.yml diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml new file mode 100644 index 0000000..9ce9d32 --- /dev/null +++ b/.github/workflows/c-cpp.yml @@ -0,0 +1,17 @@ +name: C/C++ CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Build project + run: make From f005aa81d64ea8cf48972e6e20654374d8accebc Mon Sep 17 00:00:00 2001 From: Sujal Kumar Date: Mon, 2 Mar 2026 13:28:30 +0530 Subject: [PATCH 13/23] Adding format tools to the repo. --- .clang-format | 28 ++++++++++++++++++++++++++++ Makefile | 5 +++++ 2 files changed, 33 insertions(+) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..e7dbe41 --- /dev/null +++ b/.clang-format @@ -0,0 +1,28 @@ +--- +Language: Cpp +BasedOnStyle: Google + +# Basic Formatting +IndentWidth: 4 +ColumnLimit: 100 +TabWidth: 4 +UseTab: Never + +# Braces and Spacing +BreakBeforeBraces: Attach +AllowShortBlocksOnASingleLine: Empty +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false + +# Pointers and References (e.g., int* ptr instead of int *ptr) +PointerAlignment: Left + +# Alignment for readability +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: false +AlignOperands: Align +AlignTrailingComments: true + +# Includes +SortIncludes: true diff --git a/Makefile b/Makefile index e82cc23..8c8b5f8 100644 --- a/Makefile +++ b/Makefile @@ -40,3 +40,8 @@ run: all clean: rm -rf $(BUILD_DIR) +format: + find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format -i + +check-format: + find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format --dry-run --Werror From 51caa2a9260c1571ca3770423e3396a2c090f977 Mon Sep 17 00:00:00 2001 From: Sujal Kumar Date: Mon, 2 Mar 2026 13:31:31 +0530 Subject: [PATCH 14/23] Formatted code using make --- src/common/token/token.cpp | 6 +-- src/lexer/lexer.cpp | 87 ++++++++++++++++++-------------------- 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/common/token/token.cpp b/src/common/token/token.cpp index cc33e7e..b054974 100644 --- a/src/common/token/token.cpp +++ b/src/common/token/token.cpp @@ -1,5 +1,5 @@ -#include #include +#include -Token::Token(TokenType type, std::string val, int l, int c) : type(type), value(val), line(l), column(c){} - +Token::Token(TokenType type, std::string val, int l, int c) + : type(type), value(val), line(l), column(c) {} diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index 4249533..02a2d6a 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -1,117 +1,112 @@ -#include #include #include -#include #include +#include +#include -Lexer::Lexer(std::string input_string) : source_code(input_string){} -bool Lexer::isAtEnd(){ - if (current_index >= source_code.size()){ +Lexer::Lexer(std::string input_string) : source_code(input_string) {} +bool Lexer::isAtEnd() { + if (current_index >= source_code.size()) { return true; } return false; } -char Lexer::peek(){ - if (current_index >= source_code.size()){ +char Lexer::peek() { + if (current_index >= source_code.size()) { return '\0'; } return source_code[current_index]; } -char Lexer::peekNext(){ - if (current_index+1 >= source_code.size()){ +char Lexer::peekNext() { + if (current_index + 1 >= source_code.size()) { return '\0'; } - return source_code[current_index+1]; + return source_code[current_index + 1]; } -char Lexer::advance(){ - if (current_index >= source_code.size()){ +char Lexer::advance() { + if (current_index >= source_code.size()) { return '\0'; } char c = source_code[current_index++]; - if (c == '\n'){ + if (c == '\n') { line++; - column=1; + column = 1; + } else { + column++; } - else {column++;} return c; } -void Lexer::scanNumber(std::string curr){ - int start = column-1; - while (std::isdigit(peek())){ +void Lexer::scanNumber(std::string curr) { + int start = column - 1; + while (std::isdigit(peek())) { curr += Lexer::advance(); } - if (std::isalnum(peek())){ + if (std::isalnum(peek())) { // throw an error } Token token(TokenType::NUMBER, curr, line, start); tokens.push_back(token); } -void Lexer::scanString(std::string quote){ - int start = column-1; - while (!isAtEnd() && peek()!=quote[0] && peek()!='\n'){ +void Lexer::scanString(std::string quote) { + int start = column - 1; + while (!isAtEnd() && peek() != quote[0] && peek() != '\n') { quote += advance(); } - if (isAtEnd()){ + if (isAtEnd()) { // throw an error - } - else if (peek()=='\n'){ + } else if (peek() == '\n') { // throw an error - } - else{ + } else { quote += advance(); } Token token(TokenType::STRING, quote, line, start); tokens.push_back(token); } -void Lexer::scanIdentifier(std::string curr){ - int start = column-1; - while (std::isalnum(peek()) || peek()=='_'){ +void Lexer::scanIdentifier(std::string curr) { + int start = column - 1; + while (std::isalnum(peek()) || peek() == '_') { curr += advance(); } Token token(TokenType::IDENTIFIER, curr, line, start); - if (keywords.count(curr)){ + if (keywords.count(curr)) { token.type = keywords.at(curr); } tokens.push_back(token); } -std::vector Lexer::scan_Tokens(){ - while (true){ - if (isAtEnd()){ +std::vector Lexer::scan_Tokens() { + while (true) { + if (isAtEnd()) { Token token(TokenType::EOF_TOKEN, "", line, column); tokens.push_back(token); break; } std::string curr = ""; curr += advance(); - if (keywords.count(curr)){ + if (keywords.count(curr)) { TokenType type = keywords.at(curr); - if (type == TokenType::NEWLINE){ + if (type == TokenType::NEWLINE) { column = 0; line++; - } - else if (type == TokenType::INDENT){ + } else if (type == TokenType::INDENT) { column += 3; - } - else if (type == TokenType::DEDENT){ + } else if (type == TokenType::DEDENT) { column -= 5; } Token token(type, curr, line, column); tokens.push_back(token); - } - else{ - if (std::isdigit(curr[0])){ + } else { + if (std::isdigit(curr[0])) { scanNumber(curr); - } - else if (std::isalpha(curr[0])){ + } else if (std::isalpha(curr[0])) { scanIdentifier(curr); } - } + } } } From 5fd758734414c501c22ff821026600f04f2acdf9 Mon Sep 17 00:00:00 2001 From: sash070 <213149805+sash070@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:10:40 +0530 Subject: [PATCH 15/23] Added additional keywords to the lexer --- src/lexer/keywords.hpp | 49 +++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/src/lexer/keywords.hpp b/src/lexer/keywords.hpp index 838abd5..66aa289 100644 --- a/src/lexer/keywords.hpp +++ b/src/lexer/keywords.hpp @@ -7,21 +7,36 @@ #include "token.hpp" inline const std::unordered_map keywords = { - {"def", TokenType::DEF}, - {"return", TokenType::RETURN}, - {"if", TokenType::IF}, - {"else", TokenType::ELSE}, - {"print", TokenType::PRINT}, - {"=", TokenType::ASSIGN}, - {"+", TokenType::PLUS}, - {"-", TokenType::MINUS}, - {"*", TokenType::STAR}, - {"/", TokenType::SLASH}, - {"(", TokenType::LPAREN}, - {")", TokenType::RPAREN}, - {":", TokenType::COLON}, - {",", TokenType::COMMA}, - {"\n", TokenType::NEWLINE}, - {"\t", TokenType::INDENT}, - {" ", TokenType::SPACE} + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"for", TokenType::FOR}, + {"while", TokenType::WHILE}, + {"break", TokenType::BREAK}, + {"continue",TokenType::CONTINUE}, + {"pass", TokenType::PASS}, + {"if", TokenType::IF}, + {"elif", TokenType::ELIF}, + {"else", TokenType::ELSE}, + {"print", TokenType::PRINT}, + {"=", TokenType::ASSIGN}, + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {":", TokenType::COLON}, + {",", TokenType::COMMA}, + {"\n", TokenType::NEWLINE}, + {"\t", TokenType::INDENT}, + {" ", TokenType::SPACE}, + {"and", TokenType::AND}, + {"or", TokenType::OR}, + {"not", TokenType::NOT}, + {"in", TokenType::IN}, + {"is", TokenType::IS}, + {"False", TokenType::FALSE}, + {"True", TokenType::TRUE}, + {"None", TokenType::NONE}, + }; From a963eb23f45eb0ecbd52f22162b9832d346f9290 Mon Sep 17 00:00:00 2001 From: sash070 <213149805+sash070@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:15:09 +0530 Subject: [PATCH 16/23] Added new token types to TokenType enum --- src/common/token/token.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/common/token/token.hpp b/src/common/token/token.hpp index 8f2196d..3cc4117 100644 --- a/src/common/token/token.hpp +++ b/src/common/token/token.hpp @@ -6,9 +6,21 @@ enum class TokenType { DEF, RETURN, IF, + ELIF, ELSE, FOR, WHILE, + BREAK, + CONTINUE, + PASS, + TRUE, + FALSE, + NONE, + AND, + OR, + NOT, + IN, + IS, IDENTIFIER, NUMBER, @@ -22,6 +34,10 @@ enum class TokenType { MODULO, // % GREATERTHAN, // > LESSERTHAN, // < + GREATEREQUAL, // >= + LESSEQUAL, // <= + EQEQUAL, // == + LPAREN, // ( RPAREN, // ) From c28302122b69570444e2cde8c90c6507d539f136 Mon Sep 17 00:00:00 2001 From: AayushRanjan10 Date: Thu, 5 Mar 2026 23:53:12 +0530 Subject: [PATCH 17/23] Implemented floating point numbers, two-character operators --- .DS_Store | Bin 0 -> 8196 bytes Lexer/LexerClasses - Part 1.cpp | 144 ++++++++++++++++++++++++ Makefile | 47 ++++++++ madhav.txt | 1 - src/.DS_Store | Bin 0 -> 8196 bytes src/common/.DS_Store | Bin 0 -> 6148 bytes src/common/token/token.cpp | 5 + src/common/token/token.hpp | 82 ++++++++++++++ src/lexer/indentation.cpp | 0 src/lexer/indentation.hpp | 0 src/lexer/keywords.hpp | 43 +++++++ src/lexer/lexer.cpp | 192 ++++++++++++++++++++++++++++++++ src/lexer/lexer.hpp | 38 +++++++ test.txt | 6 - 14 files changed, 551 insertions(+), 7 deletions(-) create mode 100644 .DS_Store create mode 100644 Lexer/LexerClasses - Part 1.cpp create mode 100644 Makefile delete mode 100644 madhav.txt create mode 100644 src/.DS_Store create mode 100644 src/common/.DS_Store create mode 100644 src/common/token/token.cpp create mode 100644 src/common/token/token.hpp create mode 100644 src/lexer/indentation.cpp create mode 100644 src/lexer/indentation.hpp create mode 100644 src/lexer/keywords.hpp create mode 100644 src/lexer/lexer.cpp create mode 100644 src/lexer/lexer.hpp delete mode 100644 test.txt diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..ccaee7e4417edc631a3f75c56c5f5f14c0b632d3 GIT binary patch literal 8196 zcmeHMO-~a+7=EXKY(-@GQpA{S?8O9vRZwG$p%jQVen2Qe5U_6Bp=?-on%(6)Bt7fN zAE0-yo=m)O^y1Z<@h9ljlfLs&Sc)74f-}j?^X|<1%*V4kJ2SfgK+-G48vtDZU|{5E zHevFYz{`0&5MnBa^9b?*{EW*Ad6(r6`05O;fL1^&pcT*xXa!D&0=Q=LU>vgU%Tk?c z1+)VHr2_o@U}5B#Qdp5FULBZ-BLHGOW{ZF}o&yBOQkYU$ktnEGr#w9nMJ4KpK_nf^ zE$)t(Qdp5F=|ChMh-#Ur6ABTnBhM1{SELKD6C_Qp5JjI#rP!a z`z?Z?Mm+OtL0O&jKjzD>6NJg+R}!nMZ#dIvG#X9De0GtoWkWk$3DUMd%daLGbMsZ( z&QCdXEoXKNGcU9qkNSe(6e+sQJ#)MwTT8PQuNVk!Ru&j>Bc3z6wzdY6{k_($Qe|vj7ZgeKD-5#66S!a{x%(fcW30i!ymo>RjMxTzo3{l+!pHffMH@2`4&~Q3p&*aROpx`st9cO_P zI?S7y3sC7yNS8C-0>c(pQRT5RFAAopA0Qs!OwZW9Z+j)0-k5joJaxysGz&wvJna;i zCwtx^s>RE3pYkdbgFmH^iPWR2aZMT~B&30R%Z{q0kR)~%cZ5CZF^|R*sc~5n#|5AV zhTsWI!4ho38+Z%v;S+p@Z}0cNoLkMB!2*`(Iw*m literal 0 HcmV?d00001 diff --git a/Lexer/LexerClasses - Part 1.cpp b/Lexer/LexerClasses - Part 1.cpp new file mode 100644 index 0000000..d7c1ae9 --- /dev/null +++ b/Lexer/LexerClasses - Part 1.cpp @@ -0,0 +1,144 @@ +#include +#include +#include + +enum class TokenType { + DEF, + RETURN, + IF, + ELSE, + PRINT, + FOR, + WHILE, + + IDENTIFIER, + NUMBER, + STRING, + + PLUS, // + + MINUS, // - + STAR, // * + SLASH, // / + ASSIGN, // = + MODULO, // % + GREATERTHAN, // > + LESSERTHAN, // < + + LPAREN, // ( + RPAREN, // ) + + NEWLINE, + EOF_TOKEN, + + COMMA, // , + COLON, // : + INDENT, + DEDENT, + COMMENT // # +}; + + +class Token{ + public: + // Main part of a token + TokenType type; + std::string value; + + // Used to tell user about error in case error is found + int line; + int column; + + // This constructor will allow us to easily make the tokens while coding + Token(TokenType type, std::string val, int l, int c){ + this->type = type; this->value = val; + this->line = l; this->column = c; + } +}; + + +class Lexer{ + public: + Lexer(std::string input_string){ + this->source_code = preprocess_indents(input_string); + } + std::vector scan_Tokens(); + + private: + std::string source_code; + static inline const std::unordered_map keywords = { + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"if", TokenType::IF}, + {"else", TokenType::ELSE}, + {"print", TokenType::PRINT}, + {"=", TokenType::ASSIGN}, + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {":", TokenType::COLON}, + {",", TokenType::COMMA}, + {"\n", TokenType::NEWLINE}, + {"\t", TokenType::INDENT}, + {"\r", TokenType::DEDENT} + }; + std::vector tokens; + int start = 0; + int current_index = 0; + int line = 1; + + + // FUNCTIONS NEEDED FOR LEXER TO WORK + + bool isAtEnd(); // Checks for last character + char advance(); // Return current char and move forward + char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it + char peekNext(); // Peak at the next character + + void addToken(TokenType type); + + // Specific scanners for complex types + void scanString(); + void scanNumber(); + void scanIdentifier(); + std::string preprocess_indents(std::string raw); +}; + + +std::string Lexer::preprocess_indents(std::string raw) { + std::string clean_code = ""; + bool atLineStart = true; + + for (int i = 0; i < raw.length(); i++) { + if (atLineStart) { + if (raw[i] == ' ') { + if (i + 3 < raw.length() && raw.substr(i, 4) == " ") { + clean_code += '\t'; + i += 3; + continue; + } + } + else if (raw[i] == '\t') { + clean_code += '\t'; + continue; + } + else if (raw[i] == '\n') { + clean_code += '\n'; + atLineStart = true; + continue; + } + else { + atLineStart = false; + } + } + clean_code += raw[i]; + + if (raw[i] == '\n') { + atLineStart = true; + } + } + + return clean_code; +} \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8c8b5f8 --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +#vars +TARGET_EXEC := executable +BUILD_DIR := build +SRC_DIR := src + +SRCS := $(shell find $(SRC_DIR) -name '*.cpp') +OBJS := $(SRCS:$(SRC_DIR)/%.cpp=$(BUILD_DIR)/%.o) +DEPS := $(OBJS:.o=.d) +INCS := $(shell find $(SRC_DIR) -type d) + +INC_FLAGS := $(addprefix -I,$(INCS)) +CPPFLAGS := $(INC_FLAGS) -MMD -MP +LDFLAGS := + +CXX := g++ + +#all +all: $(BUILD_DIR)/$(TARGET_EXEC) + +#executable dependencies +$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS) + @echo "Linking" + mkdir -p $(BUILD_DIR) + $(CXX) $(OBJS) -o $@ $(LDFLAGS) + +#object dependencies +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp + @echo "Building dependencies" + mkdir -p $(dir $@) + $(CXX) $(CPPFLAGS) -c $< -o $@ + @echo + +#for change in header files +-include $(DEPS) + +.PHONY: all clean run + +run: all + ./$(BUILD_DIR)/$(TARGET_EXEC) + +clean: + rm -rf $(BUILD_DIR) +format: + find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format -i + +check-format: + find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format --dry-run --Werror diff --git a/madhav.txt b/madhav.txt deleted file mode 100644 index 9012a49..0000000 --- a/madhav.txt +++ /dev/null @@ -1 +0,0 @@ -i like madhav more than no one diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f7b50e2bead4c6fa383f8ba08a142500c8948d4a GIT binary patch literal 8196 zcmeHLPjAyO6o1~|lrffVEtDaoh+dF3L16=g5L)Smph9TVI*pEPl(hA_XsKN#>Hca` z&v4=c@Bz4TLgKO`7p~k89|5kM;Mq=DyRs`ns~!3Mp6&OX_!m3&3jmOGt#kt*4gdyb zf#v{;za-Hva$D*OYeXYKe*g|F6Y3ZTNFOW2RETwj(iQtu(E|xxNf=@f z<&N!^Fh@*4PFh?iIXR<9D%? z(wMLPl1VzZ*cP_^Y8dN(C{$g?50c3*q^rB9x3AymHwKKg>;~V?26j;M)3&!FN{gJk zg@$eymK?gBH={G01-8SeCmoy;MU&NM4lD8PG_SFeFMYGB!H5`OI*LVkl%DGjK7jk4J7<*z@RGsk*`r}8Ix1o3dck~H^%hM zG#jIeoEbPfcy7oX9vK~u4##4#^B3Y5$1aWK&C!Bg+$?+IX=k|0y|i1SD`n3qWvh14 zVdY#vcY>_5NAu>$G1CQFdT~^>xLZY==uw62?vY0+ll{%DulqE}HR;QFbI31x+;!)< z?*tBK%d0+eoe5|q!`3<8aSM5#tBNLniF!U_n@?`p_B@-FX?l0fwF}gpV`&})ypnT@ zoT(=tlHFo)-lL++#PknEWg_*Yp5_CD?>Lcmr?Y9ejjO@D;wpPts3Ja)peODRQ4olgA`Oo|7WkAP(_V2VIR9eOZ+o zlK7~#yy&g0$6U_JSTB5f!37FCn7yW7SYiph$K1RGnk-Mr0%#hMp8ltI`ZvAiW0N0e zxK16%K(`ukT*UtmPyYYkalGgOYX&-r0n#&@ofQZD#@Xrk;6k6?3(?9201~GpegE{he12{3y=nKjE(KRL`kiqu1y+)|?I2bYeGdLt_k%)C@t|COM`1 z`cDs>88nB6M}{&(*=+XgxzY1u7skrwi04)|Yaw6j3=3FT2pVLi7W!4E?pAzSE5+nR z?D)H+Yz`lDT_DwGho;4yI{0J`4Pv+lAt6+ZH@ChXk+{^RFPF{1s1mXum}ik6`;0EH zM!j5jZ<5EH{U}i`OT|Pvsld6BNi>Bxcd&6F^hL@jN$2c&*^`DdW*pr^kI@pm zio57FdV}7g_vi!qjJ~4pxEGuF5+27>_%5Et4{;Gc#TC4PeH_XGx{}9yQMQvBN1(Mm z=4~gW0i!jrCJ~A22}6~>(SO47*6C^nGy{KTfbS0$6ppsSmPE02KqamKfc!T?2HD4-0*>WYD7IM^-aXDe(;l;H@AnPD6?GgdbgEM^D0CEO9VCF)u; zpcy#IK(}m~{QSTF`}_aVAl=doXa@c(25@TDndMh~b+&dS$In^|%5x}OxL-@62!To+ i$3o#p@hB7_&}VT4&{o)z2o@0iBOqwdm1f|tGVl}ElGV%r literal 0 HcmV?d00001 diff --git a/src/common/token/token.cpp b/src/common/token/token.cpp new file mode 100644 index 0000000..b054974 --- /dev/null +++ b/src/common/token/token.cpp @@ -0,0 +1,5 @@ +#include +#include + +Token::Token(TokenType type, std::string val, int l, int c) + : type(type), value(val), line(l), column(c) {} diff --git a/src/common/token/token.hpp b/src/common/token/token.hpp new file mode 100644 index 0000000..e80dbe5 --- /dev/null +++ b/src/common/token/token.hpp @@ -0,0 +1,82 @@ +#pragma once + +#include + +enum class TokenType { + DEF, + RETURN, + IF, + ELIF, + ELSE, + FOR, + WHILE, + BREAK, + CONTINUE, + PASS, + TRUE, + FALSE, + NONE, + AND, + OR, + NOT, + IN, + IS, + + IDENTIFIER, + NUMBER, + STRING, + + PLUS, // + + MINUS, // - + STAR, // * + SLASH, // / + ASSIGN, // = + MODULO, // % + GREATERTHAN, // > + LESSERTHAN, // < + GREATEREQUAL, // >= + LESSEQUAL, // <= + EQEQUAL, // == + NOTEQUAL, // != + PLUSEQUAL, // += + MINUSEQUAL, // -= + STAREQUAL, // *= + SLASHEQUAL, // /= + + + LPAREN, // ( + RPAREN, // ) + + NEWLINE, + EOF_TOKEN, + + COMMA, // , + COLON, // : + INDENT, + DEDENT, + COMMENT, // # + PRINT, + AMPERSAND, // & + PIPE, // | + SPACE, +}; + + +class Token { +public: + // Main part of a token + TokenType type; + std::string value; + + // Used to tell user about error in case error is found + int line; + int column; + + // This constructor will allow us to easily make the tokens while coding + Token(TokenType type, std::string val, int l, int c); +}; + + + + + diff --git a/src/lexer/indentation.cpp b/src/lexer/indentation.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/lexer/indentation.hpp b/src/lexer/indentation.hpp new file mode 100644 index 0000000..e69de29 diff --git a/src/lexer/keywords.hpp b/src/lexer/keywords.hpp new file mode 100644 index 0000000..aa56bd1 --- /dev/null +++ b/src/lexer/keywords.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include +#include +#include + +#include "token.hpp" + +inline const std::unordered_map keywords = { + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"for", TokenType::FOR}, + {"while", TokenType::WHILE}, + {"break", TokenType::BREAK}, + {"continue",TokenType::CONTINUE}, + {"pass", TokenType::PASS}, + {"if", TokenType::IF}, + {"elif", TokenType::ELIF}, + {"else", TokenType::ELSE}, + {"print", TokenType::PRINT}, + {"=", TokenType::ASSIGN}, + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {":", TokenType::COLON}, + {",", TokenType::COMMA}, + {"\n", TokenType::NEWLINE}, + {"\t", TokenType::INDENT}, + {"and", TokenType::AND}, + {"or", TokenType::OR}, + {"not", TokenType::NOT}, + {"in", TokenType::IN}, + {"is", TokenType::IS}, + {"False", TokenType::FALSE}, + {"True", TokenType::TRUE}, + {"None", TokenType::NONE}, + {">", TokenType::GREATERTHAN}, + {"<", TokenType::LESSERTHAN}, + {"%", TokenType::MODULO}, + }; diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp new file mode 100644 index 0000000..fd44757 --- /dev/null +++ b/src/lexer/lexer.cpp @@ -0,0 +1,192 @@ +#include +#include +#include +#include +#include + +Lexer::Lexer(std::string input_string) : source_code(input_string) {} +bool Lexer::isAtEnd() { + if (current_index >= source_code.size()) { + return true; + } + return false; +} + +char Lexer::peek() { + if (current_index >= source_code.size()) { + return '\0'; + } + return source_code[current_index]; +} + +char Lexer::peekNext() { + if (current_index + 1 >= source_code.size()) { + return '\0'; + } + return source_code[current_index + 1]; +} + +char Lexer::advance() { + if (current_index >= source_code.size()) { + return '\0'; + } + char c = source_code[current_index++]; + if (c == '\n') { + line++; + column = 1; + } else { + column++; + } + return c; +} + +void Lexer::scanNumber(std::string curr) { + int start = column - 1; + while (std::isdigit(peek())) { + curr += Lexer::advance(); + } + + if (peek() == '.' && std::isdigit(peekNext())) { + curr += advance(); + while (std::isdigit(peek())) { + curr += advance(); + } + } + + if (std::isalnum(peek()) || peek()=='_') { + // throw an error + } + + Token token(TokenType::NUMBER, curr, line, start); + tokens.push_back(token); +} + +void Lexer::scanString(std::string quote) { + int start = column - 1; + while (!isAtEnd() && peek() != quote[0] && peek() != '\n') { + quote += advance(); + } + if (isAtEnd()) { + // throw an error + } else if (peek() == '\n') { + // throw an error + } else { + quote += advance(); + } + Token token(TokenType::STRING, quote, line, start); + tokens.push_back(token); +} + +void Lexer::scanIdentifier(std::string curr) { + int start = column - 1; + while (std::isalnum(peek()) || peek() == '_') { + curr += advance(); + } + Token token(TokenType::IDENTIFIER, curr, line, start); + if (keywords.count(curr)) { + token.type = keywords.at(curr); + } + tokens.push_back(token); +} + +std::vector Lexer::scan_Tokens() { + while (true) { + if (isAtEnd()) { + Token token(TokenType::EOF_TOKEN, "", line, column); + tokens.push_back(token); + break; + } + std::string curr = ""; + curr += advance(); + + if (curr == " ") { + continue; + } + + // handle two-character operators + if (curr == "=" && peek() == '=') { + advance(); + Token token(TokenType::EQEQUAL, "==", line, column-2); // using column-2 since we are advancing twice + tokens.push_back(token); + continue; + } + + if (curr == ">" && peek() == '=') { + advance(); + Token token(TokenType::GREATEREQUAL, ">=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "<" && peek() == '=') { + advance(); + Token token(TokenType::LESSEQUAL, "<=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "+" && peek() == '=') { + advance(); + Token token(TokenType::PLUSEQUAL, "+=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "-" && peek() == '=') { + advance(); + Token token(TokenType::MINUSEQUAL, "-=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "*" && peek() == '=') { + advance(); + Token token(TokenType::STAREQUAL, "*=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "/" && peek() == '=') { + advance(); + Token token(TokenType::SLASHEQUAL, "/=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "!") { + if (peek() == '=') { + advance(); + Token token(TokenType::NOTEQUAL, "!=", line, column-2); + tokens.push_back(token); + continue; + } + else { + // throw an error + } + } + + if (keywords.count(curr)) { + TokenType type = keywords.at(curr); + if (type == TokenType::NEWLINE) { + column = 0; + line++; + } else if (type == TokenType::INDENT) { + column += 3; + } else if (type == TokenType::DEDENT) { + column -= 5; + } + Token token(type, curr, line, column); + tokens.push_back(token); + } else { + if (curr == "\"" || curr == "'") { + scanString(curr); + } + else if (std::isdigit(curr[0])) { + scanNumber(curr); + } else if (std::isalpha(curr[0])) { + scanIdentifier(curr); + } + } + } + return tokens; +} diff --git a/src/lexer/lexer.hpp b/src/lexer/lexer.hpp new file mode 100644 index 0000000..ae15f7a --- /dev/null +++ b/src/lexer/lexer.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include +#include //to check if char is alphanumeric + +#include "token.hpp" + +class Lexer { +public: + Lexer(std::string input_string); + std::vector scan_Tokens(); + + +private: + std::string source_code; + std::vector tokens; + int start = 0; + int current_index = 0; + int line = 1; + int column = 1; + + + // FUNCTIONS NEEDED FOR LEXER TO WORK + bool isAtEnd(); // Checks for last character + char advance(); // Return current char and move forward + char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it + char peekNext(); // Peak at the next character + + void addToken(TokenType type); + + // Specific scanners for complex types + void scanString(std::string first); + void scanNumber(std::string first); + void scanIdentifier(std::string first); + std::string preprocess_indents(std::string raw); +}; diff --git a/test.txt b/test.txt deleted file mode 100644 index 0bc00d4..0000000 --- a/test.txt +++ /dev/null @@ -1,6 +0,0 @@ -hi -hol -hu -hlo - -i am super xrazt \ No newline at end of file From 86eb761c500b6f98c98c478f6da4105fd64e8a8f Mon Sep 17 00:00:00 2001 From: AayushRanjan10 Date: Fri, 6 Mar 2026 16:28:26 +0530 Subject: [PATCH 18/23] empty commit to rerun the workflow --- src/lexer/lexer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index 02a2d6a..089b88d 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -110,3 +110,4 @@ std::vector Lexer::scan_Tokens() { } } } +// empty commit \ No newline at end of file From cda1bbe237f70e8dfacaf34b6ea2f2cf762c9082 Mon Sep 17 00:00:00 2001 From: AayushRanjan10 Date: Thu, 5 Mar 2026 23:53:12 +0530 Subject: [PATCH 19/23] Implemented floating point numbers, two-character operators --- src/common/token/token.hpp | 5 ++ src/lexer/keywords.hpp | 5 +- src/lexer/lexer.cpp | 96 +++++++++++++++++++++++++++++++++++++- 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/src/common/token/token.hpp b/src/common/token/token.hpp index 3cc4117..e80dbe5 100644 --- a/src/common/token/token.hpp +++ b/src/common/token/token.hpp @@ -37,6 +37,11 @@ enum class TokenType { GREATEREQUAL, // >= LESSEQUAL, // <= EQEQUAL, // == + NOTEQUAL, // != + PLUSEQUAL, // += + MINUSEQUAL, // -= + STAREQUAL, // *= + SLASHEQUAL, // /= LPAREN, // ( diff --git a/src/lexer/keywords.hpp b/src/lexer/keywords.hpp index 66aa289..aa56bd1 100644 --- a/src/lexer/keywords.hpp +++ b/src/lexer/keywords.hpp @@ -29,7 +29,6 @@ inline const std::unordered_map keywords = { {",", TokenType::COMMA}, {"\n", TokenType::NEWLINE}, {"\t", TokenType::INDENT}, - {" ", TokenType::SPACE}, {"and", TokenType::AND}, {"or", TokenType::OR}, {"not", TokenType::NOT}, @@ -38,5 +37,7 @@ inline const std::unordered_map keywords = { {"False", TokenType::FALSE}, {"True", TokenType::TRUE}, {"None", TokenType::NONE}, - + {">", TokenType::GREATERTHAN}, + {"<", TokenType::LESSERTHAN}, + {"%", TokenType::MODULO}, }; diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index 089b88d..2b385c8 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -45,9 +45,24 @@ void Lexer::scanNumber(std::string curr) { while (std::isdigit(peek())) { curr += Lexer::advance(); } +<<<<<<< HEAD if (std::isalnum(peek())) { // throw an error } +======= + + if (peek() == '.' && std::isdigit(peekNext())) { + curr += advance(); + while (std::isdigit(peek())) { + curr += advance(); + } + } + + if (std::isalnum(peek()) || peek()=='_') { + // throw an error + } + +>>>>>>> c283021 (Implemented floating point numbers, two-character operators) Token token(TokenType::NUMBER, curr, line, start); tokens.push_back(token); } @@ -89,6 +104,73 @@ std::vector Lexer::scan_Tokens() { } std::string curr = ""; curr += advance(); + + if (curr == " ") { + continue; + } + + // handle two-character operators + if (curr == "=" && peek() == '=') { + advance(); + Token token(TokenType::EQEQUAL, "==", line, column-2); // using column-2 since we are advancing twice + tokens.push_back(token); + continue; + } + + if (curr == ">" && peek() == '=') { + advance(); + Token token(TokenType::GREATEREQUAL, ">=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "<" && peek() == '=') { + advance(); + Token token(TokenType::LESSEQUAL, "<=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "+" && peek() == '=') { + advance(); + Token token(TokenType::PLUSEQUAL, "+=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "-" && peek() == '=') { + advance(); + Token token(TokenType::MINUSEQUAL, "-=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "*" && peek() == '=') { + advance(); + Token token(TokenType::STAREQUAL, "*=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "/" && peek() == '=') { + advance(); + Token token(TokenType::SLASHEQUAL, "/=", line, column-2); + tokens.push_back(token); + continue; + } + + if (curr == "!") { + if (peek() == '=') { + advance(); + Token token(TokenType::NOTEQUAL, "!=", line, column-2); + tokens.push_back(token); + continue; + } + else { + // throw an error + } + } + if (keywords.count(curr)) { TokenType type = keywords.at(curr); if (type == TokenType::NEWLINE) { @@ -102,12 +184,24 @@ std::vector Lexer::scan_Tokens() { Token token(type, curr, line, column); tokens.push_back(token); } else { +<<<<<<< HEAD if (std::isdigit(curr[0])) { +======= + if (curr == "\"" || curr == "'") { + scanString(curr); + } + else if (std::isdigit(curr[0])) { +>>>>>>> c283021 (Implemented floating point numbers, two-character operators) scanNumber(curr); } else if (std::isalpha(curr[0])) { scanIdentifier(curr); } } } +<<<<<<< HEAD +} +// empty commit +======= + return tokens; } -// empty commit \ No newline at end of file +>>>>>>> c283021 (Implemented floating point numbers, two-character operators) From e9193a695c6ed0a4817ce66eb25aa3141f92c109 Mon Sep 17 00:00:00 2001 From: AayushRanjan10 Date: Fri, 6 Mar 2026 16:40:53 +0530 Subject: [PATCH 20/23] Resolved Merge conflicts in lexer.cpp --- src/lexer/lexer.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index 2b385c8..fd44757 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -45,11 +45,6 @@ void Lexer::scanNumber(std::string curr) { while (std::isdigit(peek())) { curr += Lexer::advance(); } -<<<<<<< HEAD - if (std::isalnum(peek())) { - // throw an error - } -======= if (peek() == '.' && std::isdigit(peekNext())) { curr += advance(); @@ -62,7 +57,6 @@ void Lexer::scanNumber(std::string curr) { // throw an error } ->>>>>>> c283021 (Implemented floating point numbers, two-character operators) Token token(TokenType::NUMBER, curr, line, start); tokens.push_back(token); } @@ -184,24 +178,15 @@ std::vector Lexer::scan_Tokens() { Token token(type, curr, line, column); tokens.push_back(token); } else { -<<<<<<< HEAD - if (std::isdigit(curr[0])) { -======= if (curr == "\"" || curr == "'") { scanString(curr); } else if (std::isdigit(curr[0])) { ->>>>>>> c283021 (Implemented floating point numbers, two-character operators) scanNumber(curr); } else if (std::isalpha(curr[0])) { scanIdentifier(curr); } } } -<<<<<<< HEAD -} -// empty commit -======= return tokens; } ->>>>>>> c283021 (Implemented floating point numbers, two-character operators) From fe165b0826c00af427f4a44d2a12580beb0dce3b Mon Sep 17 00:00:00 2001 From: Sujal Kumar Date: Fri, 6 Mar 2026 16:47:05 +0530 Subject: [PATCH 21/23] formatted code for new commits --- src/lexer/lexer.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index fd44757..5368bd8 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -53,7 +53,7 @@ void Lexer::scanNumber(std::string curr) { } } - if (std::isalnum(peek()) || peek()=='_') { + if (std::isalnum(peek()) || peek() == '_') { // throw an error } @@ -106,49 +106,50 @@ std::vector Lexer::scan_Tokens() { // handle two-character operators if (curr == "=" && peek() == '=') { advance(); - Token token(TokenType::EQEQUAL, "==", line, column-2); // using column-2 since we are advancing twice + Token token(TokenType::EQEQUAL, "==", line, + column - 2); // using column-2 since we are advancing twice tokens.push_back(token); continue; } if (curr == ">" && peek() == '=') { advance(); - Token token(TokenType::GREATEREQUAL, ">=", line, column-2); + Token token(TokenType::GREATEREQUAL, ">=", line, column - 2); tokens.push_back(token); continue; } if (curr == "<" && peek() == '=') { advance(); - Token token(TokenType::LESSEQUAL, "<=", line, column-2); + Token token(TokenType::LESSEQUAL, "<=", line, column - 2); tokens.push_back(token); continue; } if (curr == "+" && peek() == '=') { advance(); - Token token(TokenType::PLUSEQUAL, "+=", line, column-2); + Token token(TokenType::PLUSEQUAL, "+=", line, column - 2); tokens.push_back(token); continue; } if (curr == "-" && peek() == '=') { advance(); - Token token(TokenType::MINUSEQUAL, "-=", line, column-2); + Token token(TokenType::MINUSEQUAL, "-=", line, column - 2); tokens.push_back(token); continue; } if (curr == "*" && peek() == '=') { advance(); - Token token(TokenType::STAREQUAL, "*=", line, column-2); + Token token(TokenType::STAREQUAL, "*=", line, column - 2); tokens.push_back(token); continue; } if (curr == "/" && peek() == '=') { advance(); - Token token(TokenType::SLASHEQUAL, "/=", line, column-2); + Token token(TokenType::SLASHEQUAL, "/=", line, column - 2); tokens.push_back(token); continue; } @@ -156,15 +157,14 @@ std::vector Lexer::scan_Tokens() { if (curr == "!") { if (peek() == '=') { advance(); - Token token(TokenType::NOTEQUAL, "!=", line, column-2); + Token token(TokenType::NOTEQUAL, "!=", line, column - 2); tokens.push_back(token); continue; - } - else { + } else { // throw an error } } - + if (keywords.count(curr)) { TokenType type = keywords.at(curr); if (type == TokenType::NEWLINE) { @@ -180,8 +180,7 @@ std::vector Lexer::scan_Tokens() { } else { if (curr == "\"" || curr == "'") { scanString(curr); - } - else if (std::isdigit(curr[0])) { + } else if (std::isdigit(curr[0])) { scanNumber(curr); } else if (std::isalpha(curr[0])) { scanIdentifier(curr); From 21925ebabe507d5254d8593081e2927133ebee29 Mon Sep 17 00:00:00 2001 From: Sasank <213149805+sash070@users.noreply.github.com> Date: Fri, 6 Mar 2026 15:53:45 +0000 Subject: [PATCH 22/23] Separate maps, indent stack, improved scanNumber --- .clang-format | 56 ++-- .github/workflows/c-cpp.yml | 50 ++-- .gitignore | 91 +++--- .vscode/settings.json | 6 +- LICENSE | 42 +-- Lexer/LexerClasses - Part 1.cpp | 286 +++++++++---------- Makefile | 94 +++---- README.md | 4 +- src/common/token/token.cpp | 10 +- src/common/token/token.hpp | 166 +++++------ src/lexer/keywords.hpp | 104 ++++--- src/lexer/lexer.cpp | 472 +++++++++++++++++++------------- src/lexer/lexer.hpp | 78 +++--- 13 files changed, 787 insertions(+), 672 deletions(-) diff --git a/.clang-format b/.clang-format index e7dbe41..6433a09 100644 --- a/.clang-format +++ b/.clang-format @@ -1,28 +1,28 @@ ---- -Language: Cpp -BasedOnStyle: Google - -# Basic Formatting -IndentWidth: 4 -ColumnLimit: 100 -TabWidth: 4 -UseTab: Never - -# Braces and Spacing -BreakBeforeBraces: Attach -AllowShortBlocksOnASingleLine: Empty -AllowShortFunctionsOnASingleLine: Empty -AllowShortIfStatementsOnASingleLine: Never -AllowShortLoopsOnASingleLine: false - -# Pointers and References (e.g., int* ptr instead of int *ptr) -PointerAlignment: Left - -# Alignment for readability -AlignConsecutiveAssignments: true -AlignConsecutiveDeclarations: false -AlignOperands: Align -AlignTrailingComments: true - -# Includes -SortIncludes: true +--- +Language: Cpp +BasedOnStyle: Google + +# Basic Formatting +IndentWidth: 4 +ColumnLimit: 100 +TabWidth: 4 +UseTab: Never + +# Braces and Spacing +BreakBeforeBraces: Attach +AllowShortBlocksOnASingleLine: Empty +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false + +# Pointers and References (e.g., int* ptr instead of int *ptr) +PointerAlignment: Left + +# Alignment for readability +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: false +AlignOperands: Align +AlignTrailingComments: true + +# Includes +SortIncludes: true diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 4d16aa6..c40fd60 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,25 +1,25 @@ -name: C/C++ CI - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - name: Install clang-format - run: | - sudo apt-get update - sudo apt-get install -y clang-format - - - name: Check Formatting - run: make check-format - - - name: Build project - run: make +name: C/C++ CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install clang-format + run: | + sudo apt-get update + sudo apt-get install -y clang-format + + - name: Check Formatting + run: make check-format + + - name: Build project + run: make diff --git a/.gitignore b/.gitignore index 80a6dbe..1722d8e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,44 +1,47 @@ -#build directory -build/ - -# Prerequisites -*.d - -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Linker files -*.ilk - -# Debugger Files -*.pdb - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod -*.smod - -# Compiled Static libraries -*.lai -*.la -*.a -*.lib - -# Executables -*.exe -*.out -*.app - -# debug information files -*.dwo +#build directory +build/ + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Linker files +*.ilk + +# Debugger Files +*.pdb + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# debug information files +*.dwo + +# vscode +.vscode/ diff --git a/.vscode/settings.json b/.vscode/settings.json index ad52559..64d7619 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,3 @@ -{ - "makefile.configureOnOpen": true -} +{ + "makefile.configureOnOpen": true +} diff --git a/LICENSE b/LICENSE index 17601d9..85e3539 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2026 Sujal Kumar - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +MIT License + +Copyright (c) 2026 Sujal Kumar + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Lexer/LexerClasses - Part 1.cpp b/Lexer/LexerClasses - Part 1.cpp index d7c1ae9..05f9a39 100644 --- a/Lexer/LexerClasses - Part 1.cpp +++ b/Lexer/LexerClasses - Part 1.cpp @@ -1,144 +1,144 @@ -#include -#include -#include - -enum class TokenType { - DEF, - RETURN, - IF, - ELSE, - PRINT, - FOR, - WHILE, - - IDENTIFIER, - NUMBER, - STRING, - - PLUS, // + - MINUS, // - - STAR, // * - SLASH, // / - ASSIGN, // = - MODULO, // % - GREATERTHAN, // > - LESSERTHAN, // < - - LPAREN, // ( - RPAREN, // ) - - NEWLINE, - EOF_TOKEN, - - COMMA, // , - COLON, // : - INDENT, - DEDENT, - COMMENT // # -}; - - -class Token{ - public: - // Main part of a token - TokenType type; - std::string value; - - // Used to tell user about error in case error is found - int line; - int column; - - // This constructor will allow us to easily make the tokens while coding - Token(TokenType type, std::string val, int l, int c){ - this->type = type; this->value = val; - this->line = l; this->column = c; - } -}; - - -class Lexer{ - public: - Lexer(std::string input_string){ - this->source_code = preprocess_indents(input_string); - } - std::vector scan_Tokens(); - - private: - std::string source_code; - static inline const std::unordered_map keywords = { - {"def", TokenType::DEF}, - {"return", TokenType::RETURN}, - {"if", TokenType::IF}, - {"else", TokenType::ELSE}, - {"print", TokenType::PRINT}, - {"=", TokenType::ASSIGN}, - {"+", TokenType::PLUS}, - {"-", TokenType::MINUS}, - {"*", TokenType::STAR}, - {"/", TokenType::SLASH}, - {"(", TokenType::LPAREN}, - {")", TokenType::RPAREN}, - {":", TokenType::COLON}, - {",", TokenType::COMMA}, - {"\n", TokenType::NEWLINE}, - {"\t", TokenType::INDENT}, - {"\r", TokenType::DEDENT} - }; - std::vector tokens; - int start = 0; - int current_index = 0; - int line = 1; - - - // FUNCTIONS NEEDED FOR LEXER TO WORK - - bool isAtEnd(); // Checks for last character - char advance(); // Return current char and move forward - char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it - char peekNext(); // Peak at the next character - - void addToken(TokenType type); - - // Specific scanners for complex types - void scanString(); - void scanNumber(); - void scanIdentifier(); - std::string preprocess_indents(std::string raw); -}; - - -std::string Lexer::preprocess_indents(std::string raw) { - std::string clean_code = ""; - bool atLineStart = true; - - for (int i = 0; i < raw.length(); i++) { - if (atLineStart) { - if (raw[i] == ' ') { - if (i + 3 < raw.length() && raw.substr(i, 4) == " ") { - clean_code += '\t'; - i += 3; - continue; - } - } - else if (raw[i] == '\t') { - clean_code += '\t'; - continue; - } - else if (raw[i] == '\n') { - clean_code += '\n'; - atLineStart = true; - continue; - } - else { - atLineStart = false; - } - } - clean_code += raw[i]; - - if (raw[i] == '\n') { - atLineStart = true; - } - } - - return clean_code; +#include +#include +#include + +enum class TokenType { + DEF, + RETURN, + IF, + ELSE, + PRINT, + FOR, + WHILE, + + IDENTIFIER, + NUMBER, + STRING, + + PLUS, // + + MINUS, // - + STAR, // * + SLASH, // / + ASSIGN, // = + MODULO, // % + GREATERTHAN, // > + LESSERTHAN, // < + + LPAREN, // ( + RPAREN, // ) + + NEWLINE, + EOF_TOKEN, + + COMMA, // , + COLON, // : + INDENT, + DEDENT, + COMMENT // # +}; + + +class Token{ + public: + // Main part of a token + TokenType type; + std::string value; + + // Used to tell user about error in case error is found + int line; + int column; + + // This constructor will allow us to easily make the tokens while coding + Token(TokenType type, std::string val, int l, int c){ + this->type = type; this->value = val; + this->line = l; this->column = c; + } +}; + + +class Lexer{ + public: + Lexer(std::string input_string){ + this->source_code = preprocess_indents(input_string); + } + std::vector scan_Tokens(); + + private: + std::string source_code; + static inline const std::unordered_map keywords = { + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"if", TokenType::IF}, + {"else", TokenType::ELSE}, + {"print", TokenType::PRINT}, + {"=", TokenType::ASSIGN}, + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {":", TokenType::COLON}, + {",", TokenType::COMMA}, + {"\n", TokenType::NEWLINE}, + {"\t", TokenType::INDENT}, + {"\r", TokenType::DEDENT} + }; + std::vector tokens; + int start = 0; + int current_index = 0; + int line = 1; + + + // FUNCTIONS NEEDED FOR LEXER TO WORK + + bool isAtEnd(); // Checks for last character + char advance(); // Return current char and move forward + char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it + char peekNext(); // Peak at the next character + + void addToken(TokenType type); + + // Specific scanners for complex types + void scanString(); + void scanNumber(); + void scanIdentifier(); + std::string preprocess_indents(std::string raw); +}; + + +std::string Lexer::preprocess_indents(std::string raw) { + std::string clean_code = ""; + bool atLineStart = true; + + for (int i = 0; i < raw.length(); i++) { + if (atLineStart) { + if (raw[i] == ' ') { + if (i + 3 < raw.length() && raw.substr(i, 4) == " ") { + clean_code += '\t'; + i += 3; + continue; + } + } + else if (raw[i] == '\t') { + clean_code += '\t'; + continue; + } + else if (raw[i] == '\n') { + clean_code += '\n'; + atLineStart = true; + continue; + } + else { + atLineStart = false; + } + } + clean_code += raw[i]; + + if (raw[i] == '\n') { + atLineStart = true; + } + } + + return clean_code; } \ No newline at end of file diff --git a/Makefile b/Makefile index 8c8b5f8..071fef5 100644 --- a/Makefile +++ b/Makefile @@ -1,47 +1,47 @@ -#vars -TARGET_EXEC := executable -BUILD_DIR := build -SRC_DIR := src - -SRCS := $(shell find $(SRC_DIR) -name '*.cpp') -OBJS := $(SRCS:$(SRC_DIR)/%.cpp=$(BUILD_DIR)/%.o) -DEPS := $(OBJS:.o=.d) -INCS := $(shell find $(SRC_DIR) -type d) - -INC_FLAGS := $(addprefix -I,$(INCS)) -CPPFLAGS := $(INC_FLAGS) -MMD -MP -LDFLAGS := - -CXX := g++ - -#all -all: $(BUILD_DIR)/$(TARGET_EXEC) - -#executable dependencies -$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS) - @echo "Linking" - mkdir -p $(BUILD_DIR) - $(CXX) $(OBJS) -o $@ $(LDFLAGS) - -#object dependencies -$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp - @echo "Building dependencies" - mkdir -p $(dir $@) - $(CXX) $(CPPFLAGS) -c $< -o $@ - @echo - -#for change in header files --include $(DEPS) - -.PHONY: all clean run - -run: all - ./$(BUILD_DIR)/$(TARGET_EXEC) - -clean: - rm -rf $(BUILD_DIR) -format: - find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format -i - -check-format: - find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format --dry-run --Werror +#vars +TARGET_EXEC := executable +BUILD_DIR := build +SRC_DIR := src + +SRCS := $(shell find $(SRC_DIR) -name '*.cpp') +OBJS := $(SRCS:$(SRC_DIR)/%.cpp=$(BUILD_DIR)/%.o) +DEPS := $(OBJS:.o=.d) +INCS := $(shell find $(SRC_DIR) -type d) + +INC_FLAGS := $(addprefix -I,$(INCS)) +CPPFLAGS := $(INC_FLAGS) -MMD -MP +LDFLAGS := + +CXX := g++ + +#all +all: $(BUILD_DIR)/$(TARGET_EXEC) + +#executable dependencies +$(BUILD_DIR)/$(TARGET_EXEC): $(OBJS) + @echo "Linking" + mkdir -p $(BUILD_DIR) + $(CXX) $(OBJS) -o $@ $(LDFLAGS) + +#object dependencies +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp + @echo "Building dependencies" + mkdir -p $(dir $@) + $(CXX) $(CPPFLAGS) -c $< -o $@ + @echo + +#for change in header files +-include $(DEPS) + +.PHONY: all clean run + +run: all + ./$(BUILD_DIR)/$(TARGET_EXEC) + +clean: + rm -rf $(BUILD_DIR) +format: + find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format -i + +check-format: + find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format --dry-run --Werror diff --git a/README.md b/README.md index ca0d79c..9a5e077 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# pydotcpp -Will add later +# pydotcpp +Will add later diff --git a/src/common/token/token.cpp b/src/common/token/token.cpp index b054974..02d1ee6 100644 --- a/src/common/token/token.cpp +++ b/src/common/token/token.cpp @@ -1,5 +1,5 @@ -#include -#include - -Token::Token(TokenType type, std::string val, int l, int c) - : type(type), value(val), line(l), column(c) {} +#include +#include + +Token::Token(TokenType type, std::string val, int l, int c) + : type(type), value(val), line(l), column(c) {} diff --git a/src/common/token/token.hpp b/src/common/token/token.hpp index e80dbe5..6af23e8 100644 --- a/src/common/token/token.hpp +++ b/src/common/token/token.hpp @@ -1,82 +1,84 @@ -#pragma once - -#include - -enum class TokenType { - DEF, - RETURN, - IF, - ELIF, - ELSE, - FOR, - WHILE, - BREAK, - CONTINUE, - PASS, - TRUE, - FALSE, - NONE, - AND, - OR, - NOT, - IN, - IS, - - IDENTIFIER, - NUMBER, - STRING, - - PLUS, // + - MINUS, // - - STAR, // * - SLASH, // / - ASSIGN, // = - MODULO, // % - GREATERTHAN, // > - LESSERTHAN, // < - GREATEREQUAL, // >= - LESSEQUAL, // <= - EQEQUAL, // == - NOTEQUAL, // != - PLUSEQUAL, // += - MINUSEQUAL, // -= - STAREQUAL, // *= - SLASHEQUAL, // /= - - - LPAREN, // ( - RPAREN, // ) - - NEWLINE, - EOF_TOKEN, - - COMMA, // , - COLON, // : - INDENT, - DEDENT, - COMMENT, // # - PRINT, - AMPERSAND, // & - PIPE, // | - SPACE, -}; - - -class Token { -public: - // Main part of a token - TokenType type; - std::string value; - - // Used to tell user about error in case error is found - int line; - int column; - - // This constructor will allow us to easily make the tokens while coding - Token(TokenType type, std::string val, int l, int c); -}; - - - - - +#pragma once + +#include + +enum class TokenType { + DEF, + RETURN, + IF, + ELIF, + ELSE, + FOR, + WHILE, + BREAK, + CONTINUE, + PASS, + TRUE, + FALSE, + NONE, + AND, + OR, + NOT, + IN, + IS, + + IDENTIFIER, + NUMBER, + STRING, + + PLUS, // + + MINUS, // - + STAR, // * + SLASH, // / + FLOORDIV, // // + POWER, // ** + ASSIGN, // = + MODULO, // % + GREATERTHAN, // > + LESSERTHAN, // < + GREATEREQUAL, // >= + LESSEQUAL, // <= + EQEQUAL, // == + NOTEQUAL, // != + PLUSEQUAL, // += + MINUSEQUAL, // -= + STAREQUAL, // *= + SLASHEQUAL, // /= + + + LPAREN, // ( + RPAREN, // ) + + NEWLINE, + EOF_TOKEN, + + COMMA, // , + COLON, // : + INDENT, + DEDENT, + COMMENT, // # + PRINT, + AMPERSAND, // & + PIPE, // | + SPACE, +}; + + +class Token { +public: + // Main part of a token + TokenType type; + std::string value; + + // Used to tell user about error in case error is found + int line; + int column; + + // This constructor will allow us to easily make the tokens while coding + Token(TokenType type, std::string val, int l, int c); +}; + + + + + diff --git a/src/lexer/keywords.hpp b/src/lexer/keywords.hpp index aa56bd1..c268023 100644 --- a/src/lexer/keywords.hpp +++ b/src/lexer/keywords.hpp @@ -1,43 +1,61 @@ -#pragma once - -#include -#include -#include - -#include "token.hpp" - -inline const std::unordered_map keywords = { - {"def", TokenType::DEF}, - {"return", TokenType::RETURN}, - {"for", TokenType::FOR}, - {"while", TokenType::WHILE}, - {"break", TokenType::BREAK}, - {"continue",TokenType::CONTINUE}, - {"pass", TokenType::PASS}, - {"if", TokenType::IF}, - {"elif", TokenType::ELIF}, - {"else", TokenType::ELSE}, - {"print", TokenType::PRINT}, - {"=", TokenType::ASSIGN}, - {"+", TokenType::PLUS}, - {"-", TokenType::MINUS}, - {"*", TokenType::STAR}, - {"/", TokenType::SLASH}, - {"(", TokenType::LPAREN}, - {")", TokenType::RPAREN}, - {":", TokenType::COLON}, - {",", TokenType::COMMA}, - {"\n", TokenType::NEWLINE}, - {"\t", TokenType::INDENT}, - {"and", TokenType::AND}, - {"or", TokenType::OR}, - {"not", TokenType::NOT}, - {"in", TokenType::IN}, - {"is", TokenType::IS}, - {"False", TokenType::FALSE}, - {"True", TokenType::TRUE}, - {"None", TokenType::NONE}, - {">", TokenType::GREATERTHAN}, - {"<", TokenType::LESSERTHAN}, - {"%", TokenType::MODULO}, - }; +#pragma once + +#include +#include +#include + +#include "token.hpp" + +inline const std::unordered_map keywords = { + {"def", TokenType::DEF}, + {"return", TokenType::RETURN}, + {"if", TokenType::IF}, + {"elif", TokenType::ELIF}, + {"else", TokenType::ELSE}, + {"for", TokenType::FOR}, + {"while", TokenType::WHILE}, + {"break", TokenType::BREAK}, + {"continue", TokenType::CONTINUE}, + {"pass", TokenType::PASS}, + {"True", TokenType::TRUE}, + {"False", TokenType::FALSE}, + {"None", TokenType::NONE}, + {"and", TokenType::AND}, + {"or", TokenType::OR}, + {"not", TokenType::NOT}, + {"in", TokenType::IN}, + {"is", TokenType::IS}, + {"print", TokenType::PRINT}, +}; + +inline const std::unordered_map operators = { + {"+", TokenType::PLUS}, + {"-", TokenType::MINUS}, + {"*", TokenType::STAR}, + {"/", TokenType::SLASH}, + {"//", TokenType::FLOORDIV}, + {"**", TokenType::POWER}, + {"=", TokenType::ASSIGN}, + {"%", TokenType::MODULO}, + {">", TokenType::GREATERTHAN}, + {"<", TokenType::LESSERTHAN}, + {">=", TokenType::GREATEREQUAL}, + {"<=", TokenType::LESSEQUAL}, + {"==", TokenType::EQEQUAL}, + {"!=", TokenType::NOTEQUAL}, + {"+=", TokenType::PLUSEQUAL}, + {"-=", TokenType::MINUSEQUAL}, + {"*=", TokenType::STAREQUAL}, + {"/=", TokenType::SLASHEQUAL}, + {"&", TokenType::AMPERSAND}, + {"|", TokenType::PIPE}, +}; + +inline const std::unordered_map delimiters = { + {"(", TokenType::LPAREN}, + {")", TokenType::RPAREN}, + {",", TokenType::COMMA}, + {":", TokenType::COLON}, + {"#", TokenType::COMMENT}, + {"\n", TokenType::NEWLINE}, +}; diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index 5368bd8..6e86fef 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -1,191 +1,281 @@ -#include -#include -#include -#include -#include - -Lexer::Lexer(std::string input_string) : source_code(input_string) {} -bool Lexer::isAtEnd() { - if (current_index >= source_code.size()) { - return true; - } - return false; -} - -char Lexer::peek() { - if (current_index >= source_code.size()) { - return '\0'; - } - return source_code[current_index]; -} - -char Lexer::peekNext() { - if (current_index + 1 >= source_code.size()) { - return '\0'; - } - return source_code[current_index + 1]; -} - -char Lexer::advance() { - if (current_index >= source_code.size()) { - return '\0'; - } - char c = source_code[current_index++]; - if (c == '\n') { - line++; - column = 1; - } else { - column++; - } - return c; -} - -void Lexer::scanNumber(std::string curr) { - int start = column - 1; - while (std::isdigit(peek())) { - curr += Lexer::advance(); - } - - if (peek() == '.' && std::isdigit(peekNext())) { - curr += advance(); - while (std::isdigit(peek())) { - curr += advance(); - } - } - - if (std::isalnum(peek()) || peek() == '_') { - // throw an error - } - - Token token(TokenType::NUMBER, curr, line, start); - tokens.push_back(token); -} - -void Lexer::scanString(std::string quote) { - int start = column - 1; - while (!isAtEnd() && peek() != quote[0] && peek() != '\n') { - quote += advance(); - } - if (isAtEnd()) { - // throw an error - } else if (peek() == '\n') { - // throw an error - } else { - quote += advance(); - } - Token token(TokenType::STRING, quote, line, start); - tokens.push_back(token); -} - -void Lexer::scanIdentifier(std::string curr) { - int start = column - 1; - while (std::isalnum(peek()) || peek() == '_') { - curr += advance(); - } - Token token(TokenType::IDENTIFIER, curr, line, start); - if (keywords.count(curr)) { - token.type = keywords.at(curr); - } - tokens.push_back(token); -} - -std::vector Lexer::scan_Tokens() { - while (true) { - if (isAtEnd()) { - Token token(TokenType::EOF_TOKEN, "", line, column); - tokens.push_back(token); - break; - } - std::string curr = ""; - curr += advance(); - - if (curr == " ") { - continue; - } - - // handle two-character operators - if (curr == "=" && peek() == '=') { - advance(); - Token token(TokenType::EQEQUAL, "==", line, - column - 2); // using column-2 since we are advancing twice - tokens.push_back(token); - continue; - } - - if (curr == ">" && peek() == '=') { - advance(); - Token token(TokenType::GREATEREQUAL, ">=", line, column - 2); - tokens.push_back(token); - continue; - } - - if (curr == "<" && peek() == '=') { - advance(); - Token token(TokenType::LESSEQUAL, "<=", line, column - 2); - tokens.push_back(token); - continue; - } - - if (curr == "+" && peek() == '=') { - advance(); - Token token(TokenType::PLUSEQUAL, "+=", line, column - 2); - tokens.push_back(token); - continue; - } - - if (curr == "-" && peek() == '=') { - advance(); - Token token(TokenType::MINUSEQUAL, "-=", line, column - 2); - tokens.push_back(token); - continue; - } - - if (curr == "*" && peek() == '=') { - advance(); - Token token(TokenType::STAREQUAL, "*=", line, column - 2); - tokens.push_back(token); - continue; - } - - if (curr == "/" && peek() == '=') { - advance(); - Token token(TokenType::SLASHEQUAL, "/=", line, column - 2); - tokens.push_back(token); - continue; - } - - if (curr == "!") { - if (peek() == '=') { - advance(); - Token token(TokenType::NOTEQUAL, "!=", line, column - 2); - tokens.push_back(token); - continue; - } else { - // throw an error - } - } - - if (keywords.count(curr)) { - TokenType type = keywords.at(curr); - if (type == TokenType::NEWLINE) { - column = 0; - line++; - } else if (type == TokenType::INDENT) { - column += 3; - } else if (type == TokenType::DEDENT) { - column -= 5; - } - Token token(type, curr, line, column); - tokens.push_back(token); - } else { - if (curr == "\"" || curr == "'") { - scanString(curr); - } else if (std::isdigit(curr[0])) { - scanNumber(curr); - } else if (std::isalpha(curr[0])) { - scanIdentifier(curr); - } - } - } - return tokens; -} +#include // for exponentiation in scientific notation +#include +#include +#include +#include + +Lexer::Lexer(std::string input_string) : source_code(input_string) { + indent_stack.push(0); // stack initially must have 0 +} +bool Lexer::isAtEnd() const { // checks if we are at the end of source code + if (current_index >= source_code.size()) { + return true; + } + return false; +} + +char Lexer::peek() const { // returns character at current index + if (isAtEnd()) { + return '\0'; + } + return source_code[current_index]; +} + +char Lexer::peekNext() const { // returns character at next index + if (current_index + 1 >= source_code.size()) { + return '\0'; + } + return source_code[current_index + 1]; +} + +char Lexer::advance() { // returns character at current index and + if (isAtEnd()) { // makes necessary changes to line and column + return '\0'; + } + char c = source_code[current_index++]; + if (c == '\n') { + line++; + column = 1; + } else { + column++; + } + return c; +} + +void Lexer::processIndent() { // maintains an indent stack and adds indent and dedent tokens + // wherever necessary + int indent = 0; + + while (peek() == ' ' || peek() == '\t') { // handles spaces and tabs + switch (peek()) { + case '\t': + indent += 4; + break; + default: + indent += 1; + } + advance(); + } + if (peek() == '\n' || peek() == '\0' || peek() == '#') + return; // empty line, EOF, comments don't affect indentation + + if (indent > indent_stack.top()) { // current indent is larger; add indent token + indent_stack.push(indent); + Token token(TokenType::INDENT, "", line, 1); + tokens.push_back(token); + } else if (indent < indent_stack.top()) { // add dedent tokens until indentations match + while (!indent_stack.empty() && + indent_stack.top() != indent) { // current indent must be present elsewhere in the + indent_stack.pop(); // stack, else it is an error + Token token(TokenType::DEDENT, "", line, 1); + tokens.push_back(token); + } + if (indent_stack.empty()) { + // throw an error - invalid indentation + } + } +} + +void Lexer::scanNumber(std::string num) { + int start = column - 1; // column of the number token + + if (num == "0" && std::isdigit(peek())) { + // throw an error, leading zero + } + + while (std::isdigit(peek())) { + num += Lexer::advance(); + } + if (peek() == '.' && num[0] == '.') { + // throw an error - we can have .23 and 23.23, but .23.23 is error + } + if (peek() == '.' && std::isdigit(peekNext())) { // for floating point numbers + num += advance(); + while (std::isdigit(peek())) { + num += advance(); + } + } + + if ((peek() == 'e' || peek() == 'E')) { // scientific notation + advance(); + if (!std::isdigit(peek()) && peek() != '+' && peek() != '-') { + // throw an error - invalid syntax + } + std::string power = ""; // power can only be an integer in scientific notation in python + if (peek() == '+' || peek() == '-') { + if (!std::isdigit(peekNext())) { + // throw an error - invalid syntax + } + power += advance(); + } + while (std::isdigit(peek())) { + power += advance(); + } + num = std::to_string(std::stod(num) * + std::pow(10, std::stod(power))); // scientific -> decimal + } + + if (std::isalnum(peek()) || peek() == '_' || peek() == '.') { + // throw an error, invalid floating point number + } + + Token token(TokenType::NUMBER, num, line, start); + tokens.push_back(token); +} + +void Lexer::scanString(std::string str) { + int start = column - 1; + while (!isAtEnd() && peek() != str[0] && peek() != '\n') { + str += advance(); + } + if (isAtEnd()) { + // throw an error + } else if (peek() == '\n') { + // throw an error + } else { + str += advance(); + } + Token token(TokenType::STRING, str, line, start); + tokens.push_back(token); +} + +void Lexer::scanIdentifier(std::string identifier) { + int start = column - 1; + while (std::isalnum(peek()) || peek() == '_') { + identifier += advance(); + } + Token token(TokenType::IDENTIFIER, identifier, line, start); + if (keywords.count(identifier)) { + token.type = keywords.at(identifier); + } + tokens.push_back(token); +} + +std::vector Lexer::scan_Tokens() { + while (true) { + if (isAtEnd()) { + Token token(TokenType::EOF_TOKEN, "", line, column); + tokens.push_back(token); + break; + } + std::string curr = ""; + curr += advance(); + + if (curr == " " || curr == "\t") { + continue; + } + + // handle two-character operators + if (curr == "=" && peek() == '=') { + advance(); + Token token(TokenType::EQEQUAL, "==", line, + column - 2); // using column-2 since we are advancing twice + tokens.push_back(token); + continue; + } + + if (curr == ">" && peek() == '=') { + advance(); + Token token(TokenType::GREATEREQUAL, ">=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "<" && peek() == '=') { + advance(); + Token token(TokenType::LESSEQUAL, "<=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "+" && peek() == '=') { + advance(); + Token token(TokenType::PLUSEQUAL, "+=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "-" && peek() == '=') { + advance(); + Token token(TokenType::MINUSEQUAL, "-=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "*" && peek() == '=') { + advance(); + Token token(TokenType::STAREQUAL, "*=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "/" && peek() == '=') { + advance(); + Token token(TokenType::SLASHEQUAL, "/=", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "/" && peek() == '/') { + advance(); + Token token(TokenType::FLOORDIV, "//", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "*" && peek() == '*') { + advance(); + Token token(TokenType::POWER, "**", line, column - 2); + tokens.push_back(token); + continue; + } + + if (curr == "!") { + if (peek() == '=') { + advance(); + Token token(TokenType::NOTEQUAL, "!=", line, column - 2); + tokens.push_back(token); + continue; + } else { + // throw an error + } + } + + auto it = operators.find( + curr); // curr cannot be found in keywords at this point; it is a single character + if ((it = operators.find(curr)) != operators.end()) { + TokenType type = it->second; + Token token(type, curr, line, column); + tokens.push_back(token); + } else if ((it = delimiters.find(curr)) != delimiters.end()) { + TokenType type = it->second; + + if (type == TokenType::NEWLINE) { // add newline token, and process indentation + column = 1; + Token token(type, curr, line - 1, column); + tokens.push_back(token); + processIndent(); + } else { + Token token(type, curr, line, column); + tokens.push_back(token); + } + } else { + if (curr == "\"" || curr == "'") { + scanString(curr); + } else if (std::isdigit(curr[0]) || + (curr[0] == '.' && + std::isdigit(peek()))) { // .23 is also valid syntax in python + scanNumber(curr); + } else if (std::isalpha(curr[0]) || curr[0] == '_') { // _foo is also valid + scanIdentifier(curr); + } else if (curr[0] == '#') { // comment, ignore everything until newline or EOF + while (peek() != '\n' && peek() != '\0') { + advance(); + } + } else { + // throw an error - unexpected character + } + } + } + return tokens; +} diff --git a/src/lexer/lexer.hpp b/src/lexer/lexer.hpp index ae15f7a..78bbffb 100644 --- a/src/lexer/lexer.hpp +++ b/src/lexer/lexer.hpp @@ -1,38 +1,40 @@ -#pragma once - -#include -#include -#include -#include //to check if char is alphanumeric - -#include "token.hpp" - -class Lexer { -public: - Lexer(std::string input_string); - std::vector scan_Tokens(); - - -private: - std::string source_code; - std::vector tokens; - int start = 0; - int current_index = 0; - int line = 1; - int column = 1; - - - // FUNCTIONS NEEDED FOR LEXER TO WORK - bool isAtEnd(); // Checks for last character - char advance(); // Return current char and move forward - char peek(); // Sometimes, we don't actually want to read a character and may only want to peek at it - char peekNext(); // Peak at the next character - - void addToken(TokenType type); - - // Specific scanners for complex types - void scanString(std::string first); - void scanNumber(std::string first); - void scanIdentifier(std::string first); - std::string preprocess_indents(std::string raw); -}; +#pragma once + +#include +#include +#include +#include //to check if char is alphanumeric +#include + +#include "token.hpp" + +class Lexer { +public: + Lexer(std::string input_string); + std::vector scan_Tokens(); + + +private: + std::string source_code; + std::vector tokens; + int start = 0; + int current_index = 0; + int line = 1; + int column = 1; + std::stack indent_stack; + + + // FUNCTIONS NEEDED FOR LEXER TO WORK + bool isAtEnd() const; // Checks for last character + char advance(); // Return current char and move forward + char peek() const; // Sometimes, we don't actually want to read a character and may only want to peek at it + char peekNext() const; // Peak at the next character + + void addToken(TokenType type); + + // Specific scanners for complex types + void scanString(std::string first); + void scanNumber(std::string first); + void scanIdentifier(std::string first); + void processIndent(); // To process indents at the start of every line +}; From 6b72840908c497a0d5448b653e3ed518f382c0d2 Mon Sep 17 00:00:00 2001 From: Sasank <213149805+sash070@users.noreply.github.com> Date: Sat, 7 Mar 2026 12:15:56 +0000 Subject: [PATCH 23/23] Created tester for lexer, modified makefile to include testing --- Makefile | 33 +++++++++++++++++++++++++++ src/lexer/keywords.hpp | 4 ++-- src/lexer/lexer.cpp | 11 +++++---- src/lexer/lexer.hpp | 2 +- tests/lexer_test.cpp | 51 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 92 insertions(+), 9 deletions(-) create mode 100644 tests/lexer_test.cpp diff --git a/Makefile b/Makefile index 071fef5..10d66db 100644 --- a/Makefile +++ b/Makefile @@ -2,12 +2,18 @@ TARGET_EXEC := executable BUILD_DIR := build SRC_DIR := src +TEST_DIR := tests SRCS := $(shell find $(SRC_DIR) -name '*.cpp') OBJS := $(SRCS:$(SRC_DIR)/%.cpp=$(BUILD_DIR)/%.o) DEPS := $(OBJS:.o=.d) INCS := $(shell find $(SRC_DIR) -type d) +LEXER_TEST := $(BUILD_DIR)/$(TEST_DIR)/lexer_test +PARSER_TEST := $(BUILD_DIR)/$(TEST_DIR)/parser_test +EVALUATOR_TEST := $(BUILD_DIR)/$(TEST_DIR)/evaluator_test +TEST_DEPS := $(LEXER_TEST).d $(PARSER_TEST).d + INC_FLAGS := $(addprefix -I,$(INCS)) CPPFLAGS := $(INC_FLAGS) -MMD -MP LDFLAGS := @@ -32,6 +38,7 @@ $(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp #for change in header files -include $(DEPS) +-include $(TEST_DEPS) .PHONY: all clean run @@ -45,3 +52,29 @@ format: check-format: find $(SRC_DIR)/ -name '*.cpp' -o -name '*.h' | xargs clang-format --dry-run --Werror + + +# separate tests for lexer, parser, evaluator +test-lexer: $(LEXER_TEST) + ./$(LEXER_TEST) + +test-parser: $(PARSER_TEST) + ./$(PARSER_TEST) + +test-evaluator: $(EVALUATOR_TEST) + ./$(EVALUATOR_TEST) + +# macro for executable and object dependencies for tests +define TEST_RULE +$(1): $(OBJS) $(1).o + mkdir -p $(BUILD_DIR)/$(TEST_DIR) + $(CXX) $(CPPFLAGS) $$^ -o $$@ + +$(1).o: $(TEST_DIR)/$(notdir $(1)).cpp + mkdir -p $(BUILD_DIR)/$(TEST_DIR) + $(CXX) $(CPPFLAGS) -c $$< -o $$@ +endef + +$(eval $(call TEST_RULE,$(LEXER_TEST))) +$(eval $(call TEST_RULE,$(PARSER_TEST))) +$(eval $(call TEST_RULE,$(EVALUATOR_TEST))) diff --git a/src/lexer/keywords.hpp b/src/lexer/keywords.hpp index c268023..460cf11 100644 --- a/src/lexer/keywords.hpp +++ b/src/lexer/keywords.hpp @@ -4,7 +4,7 @@ #include #include -#include "token.hpp" +#include "common/token/token.hpp" inline const std::unordered_map keywords = { {"def", TokenType::DEF}, @@ -56,6 +56,6 @@ inline const std::unordered_map delimiters = { {")", TokenType::RPAREN}, {",", TokenType::COMMA}, {":", TokenType::COLON}, - {"#", TokenType::COMMENT}, +// {"#", TokenType::COMMENT}, {"\n", TokenType::NEWLINE}, }; diff --git a/src/lexer/lexer.cpp b/src/lexer/lexer.cpp index 6e86fef..e9bccc9 100644 --- a/src/lexer/lexer.cpp +++ b/src/lexer/lexer.cpp @@ -97,22 +97,20 @@ void Lexer::scanNumber(std::string num) { } if ((peek() == 'e' || peek() == 'E')) { // scientific notation - advance(); + num += advance(); if (!std::isdigit(peek()) && peek() != '+' && peek() != '-') { // throw an error - invalid syntax } - std::string power = ""; // power can only be an integer in scientific notation in python + // power can only be an integer in scientific notation in python if (peek() == '+' || peek() == '-') { if (!std::isdigit(peekNext())) { // throw an error - invalid syntax } - power += advance(); + num += advance(); } while (std::isdigit(peek())) { - power += advance(); + num += advance(); } - num = std::to_string(std::stod(num) * - std::pow(10, std::stod(power))); // scientific -> decimal } if (std::isalnum(peek()) || peek() == '_' || peek() == '.') { @@ -269,6 +267,7 @@ std::vector Lexer::scan_Tokens() { } else if (std::isalpha(curr[0]) || curr[0] == '_') { // _foo is also valid scanIdentifier(curr); } else if (curr[0] == '#') { // comment, ignore everything until newline or EOF + curr = ""; while (peek() != '\n' && peek() != '\0') { advance(); } diff --git a/src/lexer/lexer.hpp b/src/lexer/lexer.hpp index 78bbffb..ad06881 100644 --- a/src/lexer/lexer.hpp +++ b/src/lexer/lexer.hpp @@ -6,7 +6,7 @@ #include //to check if char is alphanumeric #include -#include "token.hpp" +#include "common/token/token.hpp" class Lexer { public: diff --git a/tests/lexer_test.cpp b/tests/lexer_test.cpp new file mode 100644 index 0000000..9812da6 --- /dev/null +++ b/tests/lexer_test.cpp @@ -0,0 +1,51 @@ +#include +#include "lexer/lexer.hpp" + +int passed = 0; +int failed = 0; + +void test(std::string input_code, std::vector> expected) { + Lexer lexer(input_code); + std::vector tokens = lexer.scan_Tokens(); + + for (int i=0; i 0 ? 1 : 0; +}