From 5c4c7cd3f614965de23cc625b1cb72cd69f3b002 Mon Sep 17 00:00:00 2001 From: Zengtudor Date: Thu, 19 Sep 2024 22:27:40 +0800 Subject: [PATCH] update --- src/main.cpp | 137 ++++++++++++++++++++++++++++++++++---------- src/tools/tools.hpp | 10 ++++ 2 files changed, 117 insertions(+), 30 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 4a92d86..b763131 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,15 +5,18 @@ #include #include #include +#include #include #include #include #include"tools.hpp" // 自己写的库,在src/tools/tools.hpp当中,注意要使用C++23标准编译 #include #include +#include #include #include #include +#include // 这两个宏用来申请读入和读出流,实现反射并输出日志,获取申请流的变量名字 @@ -21,10 +24,11 @@ #define OPEN_IFS_AND_CHECK(file_path,value_name)std::ifstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open input file stream : "< complement = { //这里使用查表的方式大大提高CPU速度,因为if分支CPU不容易命中缓存,需要使用查表加速 {'A', 'T'}, {'a', 'T'}, @@ -33,12 +37,12 @@ void reverseComplement(auto &DNAsequence, const size_t buf_size) //注意这里 {'G', 'C'}, {'g', 'C'} }; - std::reverse(DNAsequence.begin(), DNAsequence.begin() + buf_size); //翻转DNA序列 + std::reverse(begin, end); //翻转DNA序列 - for (std::remove_const_t i = 0; i < buf_size; ++i) { //std::remove_const_t意思是和buf_size相同的类型并去掉const - auto it = complement.find(DNAsequence[i]);//查表并替换 + for (std::remove_const_t i = begin; i < end; ++i) { //std::remove_const_t意思是和buf_size相同的类型并去掉const + auto it = complement.find(*i);//查表并替换 if (it != complement.end()) [[likely]] { - DNAsequence[i] = it->second; + *i = it->second; } } } @@ -47,9 +51,9 @@ void reverseComplement(auto &DNAsequence, const size_t buf_size) //注意这里 class Spent{ // 使用RAII原理的自动计时器,计算主函数运行时间,析构时自动输出 private: const decltype(std::chrono::system_clock::now()) start; - const std::string_view name; + const std::string name; public: - Spent(const std::string_view name)noexcept:start(std::chrono::system_clock::now()),name(name){ + Spent(const std::string name)noexcept:start(std::chrono::system_clock::now()),name(name){ zt::print("[Timer: ",name,"]"," Start timing","\n"); } ~Spent()noexcept{ @@ -64,47 +68,120 @@ int main() try{ //std::ios_base::sync_with_stdio(false); //加了没效果 //这里直接关掉就行了,不会影响读入,因为目前是一次性读入。开了反而会让日志输出变成全缓冲,不友好 // using namespace std; // 别加,刚被坑了 - - Spent all_spent("All spent"); //自动计时器,给主函数计时 - std::array buf; + // std::array buf; bool lines = 0; //使用布尔值加速 + const auto get_lines_add = [&lines]() { + bool old_value = lines; // 保存旧值 + lines = !lines; // 改变布尔值 + return old_value; // 返回旧值 + }; + + std::filesystem::path input_path("filteredReads.txt"),output_path("reversedSequence.txt"); OPEN_IFS_AND_CHECK(input_path, input_file_stream) //创建输入和输出流 OPEN_OFS_AND_CHECK(output_path, output_file_stream) - while (input_file_stream.getline(buf.data(),MAX_SIZE,'\n')) - { - // lines=!lines; //防止溢出 - const auto buf_len = strlen(buf.data()); - const std::string_view suffix("\n"); //设置一个每个DNA序列结尾的字符,这里是以\n换行来结尾 - if (lines == true){ - // output_file_stream << reverseComplement(buf) << endl; - reverseComplement(buf,buf_len); - } - // buf+=suffix; - for(std::remove_const_t i=0;i buf(BUF_SIZE); // 堆上分配可以大一点 + std::array tmp_buf;//用于处理截断的DNA,直接在栈上申请 - return 0; + Spent all_spent("All spent"); //自动计时器,给主函数计时 + unsigned int chunk_id = 0; + size_t last_buf_size = 0; + while (input_file_stream.eof()==false) + { + Spent chunk_spent(zt::fmt("chunk_id:[",++chunk_id,"]")); + { + Spent chunk_read_spent(zt::fmt("read_chunk_id:[",chunk_id,"]")); + input_file_stream.read(buf.data(),buf.size()); + } + // lines=!lines; //防止溢出 + const auto buf_len = input_file_stream.gcount(); + + zt::print(NAME_VALUE(buf_len),"\n"); + + if(buf_len == std::numeric_limits::max())[[unlikely]]{ + THROW_RT_ERROR("get input file stream read buf size failed\n") + } + + if(buf_len == 0)[[unlikely]]{ + break; + } + + const std::string_view buf_str_v(buf.data(),buf_len); //string_view是零拷贝,但是要注意悬垂引用 + + size_t start_pos = 0; + size_t end_pos = 0; + + if(last_buf_size>0)[[likely]]{ + Spent recovery_interrupt_spent(zt::fmt("recovery_interrupt [",chunk_id,"]")); + if((end_pos=buf_str_v.find('\n',start_pos)) != std::string_view::npos)[[likely]]{ + std::memcpy(tmp_buf.data()+last_buf_size,buf.data(),end_pos+1); + if(get_lines_add()){ + reverseComplement(tmp_buf.data(), tmp_buf.data()+last_buf_size+end_pos); + } + // lines=!lines; + output_file_stream.write(tmp_buf.data(), last_buf_size+end_pos+1); + }else{ + THROW_RT_ERROR("DNA incompleteness") + } + last_buf_size=0; + } + + { + Spent calculate_spent(zt::fmt("calculate_chunk_id:[",chunk_id,"]")); + + while((end_pos=buf_str_v.find('\n',start_pos)) != std::string_view::npos){ + if(get_lines_add()){ + reverseComplement(buf.data()+start_pos, buf.data()+end_pos); + } + // lines=!lines; + start_pos=end_pos+1; + } + } + + if(start_pos!=buf_len){ + zt::print("Saving interrupt chunk_id[",chunk_id,"]\n"); + std::memcpy(tmp_buf.data(),buf.data()+start_pos+1,(last_buf_size = buf_len-start_pos-1)); + } + { + Spent chunk_write_spent(zt::fmt("write_chunk_id:[",chunk_id,"] , ","[Wrote bytes] ",NAME_VALUE(start_pos))); + output_file_stream.write(buf.data(), start_pos); + } + // zt::print("[Wrote bytes] ",NAME_VALUE(start_pos),"\n"); + // const std::string_view suffix("\n"); //设置一个每个DNA序列结尾的字符,这里是以\n换行来结尾 + + + // if (lines == true){ + // // output_file_stream << reverseComplement(buf) << endl; + // reverseComplement(buf,buf_len); + // } + // // buf+=suffix; + // for(std::remove_const_t i=0;i #include #include +#include +#include #define NAME_VALUE(v)#v," : ",(v) @@ -70,6 +72,14 @@ namespace zt { return; } + template + inline std::string fmt(Args&&... args) { + std::ostringstream oss; + (oss << ... << std::forward(args)); + return oss.str(); + } + + #define THROW_RT_ERROR(why)throw std::runtime_error(zt::fmt("[FILE:",__FILE__,"] [LINE:",__LINE__,"] why:",why)); // template // inline void check_fstream_isopen(const Args&...args)noexcept(false){ // bool is_open=true;