From 5c4c7cd3f614965de23cc625b1cb72cd69f3b002 Mon Sep 17 00:00:00 2001
From: Zengtudor <zengtudor@outlook.com>
Date: Thu, 19 Sep 2024 22:27:40 +0800
Subject: [PATCH] update

---
 src/main.cpp        | 137 ++++++++++++++++++++++++++++++++++----------
 src/tools/tools.hpp |  10 ++++
 2 files changed, 117 insertions(+), 30 deletions(-)
diff --git a/src/main.cpp b/src/main.cpp
index 4a92d86..b763131 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -5,15 +5,18 @@
 #include <exception>
 #include <filesystem>
 #include<fstream>
+#include <limits>
 #include <sstream>
 #include <stdexcept>
 #include<algorithm>
 #include"tools.hpp" // 自己写的库，在src/tools/tools.hpp当中，注意要使用C++23标准编译
 #include <cstring>
 #include <stdio.h>
+#include <string>
 #include <string_view>
 #include <type_traits>
 #include <unordered_map>
+#include <vector>
 
 
 // 这两个宏用来申请读入和读出流，实现反射并输出日志，获取申请流的变量名字
@@ -21,10 +24,11 @@
 #define OPEN_IFS_AND_CHECK(file_path,value_name)std::ifstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open input file stream : "<<file_path.filename();throw std::runtime_error(ss.str());}else{zt::print("Open input file stream to value ["#value_name"] ok , from [",file_path.filename(),"]\n");}
 
 #define OPEN_OFS_AND_CHECK(file_path,value_name)std::ofstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open output file stream : "<<file_path.filename();throw std::runtime_error(ss.str());}else{zt::print("Open output file stream to value ["#value_name"] ok , from [",file_path.filename(),"]\n");}
-//最大DNA序列长度
-const size_t MAX_SIZE = 5e4+5;
 
-void reverseComplement(auto &DNAsequence, const size_t buf_size) //注意这里使用引用DNA sequence，避免拷贝开销
+// Maximum length of a single DNA sequence
+const size_t MAX_SIZE_PER_DNA = 5e4+5;
+
+void reverseComplement(char *begin, char *end) // Reverses [begin, end) in place, then swaps each character that has an entry in the complement table
 {
     static const std::unordered_map<char, char> complement = { //这里使用查表的方式大大提高CPU速度，因为if分支CPU不容易命中缓存，需要使用查表加速
         {'A', 'T'}, {'a', 'T'},
@@ -33,12 +37,12 @@ void reverseComplement(auto &DNAsequence, const size_t buf_size) //注意这里
         {'G', 'C'}, {'g', 'C'}
     };
 
-    std::reverse(DNAsequence.begin(), DNAsequence.begin() + buf_size); //翻转DNA序列
+    std::reverse(begin, end); //翻转DNA序列
     
-    for (std::remove_const_t<decltype(buf_size)> i = 0; i < buf_size; ++i) { //std::remove_const_t<decltype(buf_size)>意思是和buf_size相同的类型并去掉const
-        auto it = complement.find(DNAsequence[i]);//查表并替换
+    for (std::remove_const_t<decltype(begin)> i = begin; i < end; ++i) { // i has the same type as begin, with any top-level const removed
+        auto it = complement.find(*i);// look the character up in the table; it is replaced below when an entry exists
         if (it != complement.end()) [[likely]] {
-            DNAsequence[i] = it->second;
+            *i = it->second;
         }
     }
 }
@@ -47,9 +51,9 @@ void reverseComplement(auto &DNAsequence, const size_t buf_size) //注意这里
 class Spent{ // 使用RAII原理的自动计时器，计算主函数运行时间，析构时自动输出
 private:
     const decltype(std::chrono::system_clock::now()) start;
-    const std::string_view name;
+    const std::string name;
 public:
-    Spent(const std::string_view name)noexcept:start(std::chrono::system_clock::now()),name(name){
+    Spent(const std::string name)noexcept:start(std::chrono::system_clock::now()),name(name){
         zt::print("[Timer: ",name,"]"," Start timing","\n");
     }
     ~Spent()noexcept{
@@ -64,47 +68,120 @@ int main()
 	try{
         //std::ios_base::sync_with_stdio(false); //加了没效果 //这里直接关掉就行了，不会影响读入，因为目前是一次性读入。开了反而会让日志输出变成全缓冲，不友好
         // using namespace std; // 别加，刚被坑了
-
-        Spent all_spent("All spent"); //自动计时器，给主函数计时
         
-        std::array<char,MAX_SIZE> buf;
+        // std::array<char,MAX_SIZE> buf;
 
         bool lines = 0; //使用布尔值加速
 
+        const auto get_lines_add = [&lines]() { // flips `lines` and returns the value it held before the flip
+            bool old_value = lines;  // remember the current value
+            lines = !lines;          // toggle the flag
+            return old_value;        // hand back the pre-toggle value
+        };
+
+
         std::filesystem::path input_path("filteredReads.txt"),output_path("reversedSequence.txt");
         
         OPEN_IFS_AND_CHECK(input_path, input_file_stream) //创建输入和输出流
         OPEN_OFS_AND_CHECK(output_path, output_file_stream)
 
-        while (input_file_stream.getline(buf.data(),MAX_SIZE,'\n'))
-        {
-            // lines=!lines; //防止溢出
-            const auto buf_len = strlen(buf.data());
-            const std::string_view suffix("\n"); //设置一个每个DNA序列结尾的字符，这里是以\n换行来结尾
-            if (lines == true){
-                // output_file_stream << reverseComplement(buf) << endl;
-                reverseComplement(buf,buf_len);
-            }
-            // buf+=suffix;
-            for(std::remove_const_t<decltype(suffix.size())> i=0;i<suffix.size();i++){
-                buf[buf_len+i] = suffix[i];
-            }
-            output_file_stream.write(buf.data(), buf_len+suffix.size()); // 写入文件
-            lines=!lines; //bool取反
-        }
+        const size_t BUF_SIZE  = (size_t)4 * 1024 * 1024 *1024; // bytes requested per read (4 GiB) -- adjust the chunk size here
+        // const size_t BUF_SIZE  = (size_t)400*1024*1024; // 400 MiB variant, for testing
+
+        std::vector<char> buf(BUF_SIZE); // heap-allocated, so it can be large
+        std::array<char, MAX_SIZE_PER_DNA> tmp_buf;// holds a line that was cut off at a chunk boundary; a local (stack) array
         
-        return 0;
+        Spent all_spent("All spent"); // scoped timer covering the whole conversion
+        unsigned int chunk_id = 0;
+        size_t last_buf_size = 0; // bytes of an unterminated line carried over in tmp_buf (0 = none)
+        while (input_file_stream.eof()==false) // read the input one buf-sized chunk at a time
+        {
+            Spent chunk_spent(zt::fmt("chunk_id:[",++chunk_id,"]"));
+            {
+                Spent chunk_read_spent(zt::fmt("read_chunk_id:[",chunk_id,"]"));
+                input_file_stream.read(buf.data(),buf.size());
+            }
+            const auto buf_len = input_file_stream.gcount();
+
+            zt::print(NAME_VALUE(buf_len),"\n");
+
+            if(buf_len == std::numeric_limits<decltype(buf_len)>::max())[[unlikely]]{
+                THROW_RT_ERROR("get input file stream read buf size failed\n")
+            }
+
+            if(buf_len == 0)[[unlikely]]{
+                break;
+            }
+
+            const size_t chunk_len = static_cast<size_t>(buf_len); // unsigned copy for index arithmetic
+            const std::string_view buf_str_v(buf.data(),chunk_len); // zero-copy view; only valid until buf is refilled
+
+            size_t start_pos = 0;
+            size_t end_pos = 0;
+            size_t write_begin = 0; // first byte of buf that has not already been written through tmp_buf
+            // Finish the line that was cut off at the end of the previous chunk.
+            if(last_buf_size>0)[[likely]]{
+                Spent recovery_interrupt_spent(zt::fmt("recovery_interrupt [",chunk_id,"]"));
+                if((end_pos=buf_str_v.find('\n',start_pos)) != std::string_view::npos)[[likely]]{
+                    if(last_buf_size+end_pos+1 > tmp_buf.size())[[unlikely]]{ // would overflow tmp_buf
+                        THROW_RT_ERROR("DNA sequence longer than MAX_SIZE_PER_DNA")
+                    }
+                    std::memcpy(tmp_buf.data()+last_buf_size,buf.data(),end_pos+1);
+                    if(get_lines_add()){
+                        reverseComplement(tmp_buf.data(), tmp_buf.data()+last_buf_size+end_pos);
+                    }
+                    output_file_stream.write(tmp_buf.data(), last_buf_size+end_pos+1);
+                    // The head of buf was just emitted via tmp_buf: skip it so it is neither re-parsed nor re-written.
+                    write_begin = start_pos = end_pos+1;
+                }else{
+                    THROW_RT_ERROR("DNA incompleteness")
+                }
+                last_buf_size=0;
+            }
+            // Process every complete line of this chunk in place.
+            {
+                Spent calculate_spent(zt::fmt("calculate_chunk_id:[",chunk_id,"]"));
+                while((end_pos=buf_str_v.find('\n',start_pos)) != std::string_view::npos){
+                    if(get_lines_add()){
+                        reverseComplement(buf.data()+start_pos, buf.data()+end_pos);
+                    }
+                    start_pos=end_pos+1;
+                }
+            }
+            // Keep the unterminated tail (if any) for the next chunk.
+            if(start_pos!=chunk_len){
+                zt::print("Saving interrupt chunk_id[",chunk_id,"]\n");
+                if(chunk_len-start_pos > tmp_buf.size())[[unlikely]]{ // would overflow tmp_buf
+                    THROW_RT_ERROR("DNA sequence longer than MAX_SIZE_PER_DNA")
+                }
+                // start_pos already points at the first byte after the last '\n', so the tail starts exactly there.
+                std::memcpy(tmp_buf.data(),buf.data()+start_pos,(last_buf_size = chunk_len-start_pos));
+            }
+            {
+                Spent chunk_write_spent(zt::fmt("write_chunk_id:[",chunk_id,"] , ","[Wrote bytes] ",NAME_VALUE(start_pos)));
+                output_file_stream.write(buf.data()+write_begin, start_pos-write_begin);
+            }
+        }
+
+        if(last_buf_size>0){ // the input ended without a trailing '\n': emit the last line instead of dropping it
+            if(get_lines_add()){
+                reverseComplement(tmp_buf.data(), tmp_buf.data()+last_buf_size);
+            }
+            output_file_stream.write(tmp_buf.data(), last_buf_size);
+        }
+
     }catch(const std::exception &e){
         zt::eprint(
             "Caught an error because:\n",
             "\t",NAME_VALUE(e.what()),"\n"
             "Closing\n"
         );
+        throw; // rethrow the original exception object; `throw e;` would copy it as a plain std::exception and lose its dynamic type
     }catch(...){
         zt::eprint(
             "Caught an unknown error :\n",
             "Closing\n"
         );
+        throw;
     }
-    return -1;
 }
diff --git a/src/tools/tools.hpp b/src/tools/tools.hpp
index 6e80d6d..1462923 100644
--- a/src/tools/tools.hpp
+++ b/src/tools/tools.hpp
@@ -3,6 +3,8 @@
 #include <iostream>
 #include <ostream>
 #include <sstream>
+#include <string>
+#include <utility>
 
 #define NAME_VALUE(v)#v," : ",(v)
 
@@ -70,6 +72,14 @@ namespace zt {
         return;
     }
 
+    /// Streams every argument, in order, into one std::ostringstream and returns the resulting text.
+    template<class ...Args> inline std::string fmt(Args&&... args) {
+        std::ostringstream joined;
+        ((joined << std::forward<Args>(args)), ...); // comma fold: left-to-right, one insertion per argument
+        return joined.str();
+    }
+
+    #define THROW_RT_ERROR(why)throw std::runtime_error(zt::fmt("[FILE:",__FILE__,"] [LINE:",__LINE__,"] why:",why)); // Throws std::runtime_error tagged with __FILE__/__LINE__ and `why`; the expansion already ends with ';'
     // template <class ...Args>
     // inline void check_fstream_isopen(const Args&...args)noexcept(false){
     //     bool is_open=true;