TudorLang/include/Lexer.hpp
2025-07-12 14:14:54 +08:00

212 lines
7.1 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include <cstddef>
#include <functional>
#include <memory>
#include <ostream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
#include <cxxabi.h>
#include "Tools.hpp"
namespace ztl{
namespace lexer{
// Converts a mangled C++ symbol name (for example the value returned by
// typeid(T).name()) into a human-readable name via abi::__cxa_demangle.
//
// Parameters:
//   mangled_name - NUL-terminated mangled name; may be null.
// Returns:
//   The demangled name on success, the input text unchanged when it cannot
//   be demangled, or an empty string when mangled_name is null.
inline std::string demangle(const char* mangled_name) {
    // std::string must not be constructed from a null pointer, so handle the
    // null input before it can reach the fallback path below.
    if (mangled_name == nullptr) {
        return std::string();
    }
    int status = 0;
    // A null output buffer and null length ask __cxa_demangle to allocate the
    // result itself; status is set to 0 on success.
    char* demangled = abi::__cxa_demangle(mangled_name, nullptr, nullptr, &status);
    // Hand the allocated buffer to a smart pointer so it is freed on every
    // return path.
    const std::unique_ptr<char, decltype(&std::free)> demangled_ptr(demangled, std::free);
    if (status == 0 && demangled_ptr) {
        return std::string(demangled_ptr.get());  // demangling succeeded
    }
    return std::string(mangled_name);  // fall back to the original name
}
// Base lexical token. It only carries the text that was matched.
struct Token {
    std::string data;  // raw text of the token
};

// Category tags. Each one derives from Token and adds no members, so the
// category exists only in the static type of the object.
// Within this file the lexer produces Keywords (words), Literals (quoted
// strings), Separators ('(') and EndSeparators (';'); the remaining tags are
// declared but not produced here.
struct Keywords : Token {};
struct Identifiers : Token {};
struct Literals : Token {};
struct Operators : Token {};
struct Separators : Token {};
struct Comments : Token {};
struct EndSeparators : Token {};
struct EOF_Token : Token {};
// Characters treated as whitespace when scanning: newline, tab and space.
const std::vector<char> white_spaces = {'\n', '\t', ' '};
// Characters reported as symbols by is_symbol(): both parentheses and the
// double quote.
const std::vector<char> symbols = {'(', ')', '"'};
// Cursor over an in-memory character buffer. All reads go through `idx`,
// which always stays within [0, buffer.size()].
struct CharProvider{
    std::string buffer;  // text being scanned
    size_t idx=0;        // position of the next unread character in `buffer`

    // Returns the next unread character without advancing, or
    // Err("got EOF") once the buffer is exhausted.
    Result<char> peek(){
        if (idx<buffer.size()) {
            return Result<char>(buffer[idx]);
        }
        return Result<char>(Err("got EOF"));
    }

    // Returns the next unread character and advances past it. The error
    // produced by peek() is forwarded unchanged and the cursor is not moved.
    Result<char> consume(){
        return ztl::match(peek(), [this](char c){
            idx++;
            return Result<char>(c);
        },[](const Err &e){
            return Result<char>(e);
        });
    }

    // Advances past the next character only when it equals `c`.
    // Yields true when the character was consumed, false when the next
    // character differs (cursor unchanged), or the peek() error at end of
    // input.
    Result<bool> match(char c){
        // The expected character is captured explicitly; the lambda parameter
        // is the character actually found in the buffer.
        return ztl::match(peek(),[this,c](char next){
            if(next!=c){
                return Result<bool>(false);
            }
            idx++;
            return Result<bool>(true);
        },[](const Err &e){
            return Result<bool>(e);
        });
    }

    // Consumes the longest run of characters for which `func` returns true
    // and returns that run. The first character rejected by `func` is left
    // unread so the caller can inspect or match it next.
    // Returns an Err when the run is empty (first character rejected, or
    // already at end of input).
    Result<std::string> consume_until(std::function<bool(char)> func){
        const size_t start_idx = idx;
        while(idx<buffer.size()&&func(buffer[idx])){
            idx++;
        }
        if(idx==start_idx){
            return Result<std::string>(Err("don't hava any string"));
        }
        return Result<std::string>(buffer.substr(start_idx,idx-start_idx));
    }

    // Advances past every leading character listed in `white_spaces`.
    // Stops at the first other character or at end of input.
    void skip_whitespace(){
        while(ztl::match(peek(),[](char c){
            for(const char ws:white_spaces){
                if(c==ws){
                    return true;
                }
            }
            return false;
        },[](const Err &){
            return false;
        })){
            // peek() succeeded, so idx is in range and can be stepped safely.
            idx++;
        }
    }
};
// True when `c` is one of the ASCII decimal digits '0' through '9'.
inline bool is_digit(char c){
    if(c<'0'){
        return false;
    }
    return c<='9';
}
// True when `c` may start a word: an ASCII letter ('A'-'Z', 'a'-'z') or an
// underscore.
// The two letter ranges are tested separately because the characters that
// sit between 'Z' and 'a' in ASCII ('[', '\\', ']', '^', '`') are not
// letters. The underscore, which also sits in that gap, is accepted
// explicitly so words containing '_' keep being recognised.
inline bool is_alpha(char c){
    const bool upper = 'A'<=c && c<='Z';
    const bool lower = 'a'<=c && c<='z';
    return upper || lower || c=='_';
}
// True when `c` is accepted by is_alpha() or by is_digit().
inline bool is_alphanumeric(char c){
    if(is_alpha(c)){
        return true;
    }
    return is_digit(c);
}
// True when `c` appears in the `symbols` table.
inline bool is_symbol(char c){
    for(const char candidate:symbols){
        if(candidate==c){
            return true;
        }
    }
    return false;
}
inline std::vector<Token> lexer(const std::string&s){
CharProvider cp(s);
std::vector<Token> tokens;
while(1){
bool should_break=false;
cp.skip_whitespace();
match(cp.peek(),[&cp,&tokens](char c){
if (is_alpha(c)){
tokens.push_back(Keywords{cp.consume_until(is_alphanumeric).unwrap()});
}else if(c=='('){
tokens.push_back(Separators{std::string()+cp.consume().unwrap()});
match(cp.match(')'),[&cp](bool matched){
if(!matched){
throw std::runtime_error("unmatched '('");
}
},[](const Err &e){
throw std::runtime_error(e);
});
}else if(c=='"'){
cp.consume().unwrap();
tokens.push_back(Literals{cp.consume_until([](char c)->bool{return c!='"';}).unwrap()});
match(cp.match('"'),[&cp](bool matched){
if(!matched){
throw std::runtime_error("unmatched '\"'");
}
},[](const Err &e){
throw std::runtime_error(e);
});
}else if(c==';'){
tokens.push_back(EndSeparators{std::string()+cp.consume().unwrap()});
}
else{
throw std::runtime_error(std::string("unknown token named :'")+c+"'");
}
},[&should_break](const Err &e){
should_break=true;
});
if(should_break)break;
}
return tokens;
}
}
}
namespace std {
// Satisfied when ztl::lexer::Token is a base class of T (per std::is_base_of,
// which also holds for Token itself).
template <typename T>
concept DerivedFromToken = std::is_base_of<ztl::lexer::Token, T>::value;
template<DerivedFromToken T>
std::ostream&operator<<(std::ostream &os, const T &t){
string name = ztl::get_T_name<T>();
// ztl::logger(ztl::demangle(typeid(t).name()));
// string name = ztl::demangle(typeid(t).name());
return os<<name<<" {data: '"<<t.data<<"'}";
}
// Streams a vector as `std::vector<<element type name>> { a, b, c}`: every
// element is preceded by a space, elements are separated by commas, and an
// empty vector prints as `std::vector<...> {}`.
// NOTE(review): this overload is declared inside namespace std; the standard
// does not permit user code to add such declarations there.
template<class T>
std::ostream&operator<<(std::ostream &os, const std::vector<T> &v){
    os<<"std::vector<"<<ztl::get_T_name<T>()<<"> "<<"{";
    const size_t count = v.size();
    for(size_t pos=0;pos<count;++pos){
        os<<" "<<v[pos];
        // Every element except the last is followed by a comma.
        if(pos+1!=count){
            os<<",";
        }
    }
    os<<"}";
    return os;
}
}