update
This commit is contained in:
		
							parent
							
								
									e25f75fe53
								
							
						
					
					
						commit
						5c4c7cd3f6
					
				
							
								
								
									
										127
									
								
								src/main.cpp
									
									
									
									
									
								
							
							
						
						
									
										127
									
								
								src/main.cpp
									
									
									
									
									
								
							| @ -5,15 +5,18 @@ | |||||||
| #include <exception> | #include <exception> | ||||||
| #include <filesystem> | #include <filesystem> | ||||||
| #include<fstream> | #include<fstream> | ||||||
|  | #include <limits> | ||||||
| #include <sstream> | #include <sstream> | ||||||
| #include <stdexcept> | #include <stdexcept> | ||||||
| #include<algorithm> | #include<algorithm> | ||||||
| #include"tools.hpp" // 自己写的库,在src/tools/tools.hpp当中,注意要使用C++23标准编译
 | #include"tools.hpp" // 自己写的库,在src/tools/tools.hpp当中,注意要使用C++23标准编译
 | ||||||
| #include <cstring> | #include <cstring> | ||||||
| #include <stdio.h> | #include <stdio.h> | ||||||
|  | #include <string> | ||||||
| #include <string_view> | #include <string_view> | ||||||
| #include <type_traits> | #include <type_traits> | ||||||
| #include <unordered_map> | #include <unordered_map> | ||||||
|  | #include <vector> | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| // 这两个宏用来申请读入和读出流,实现反射并输出日志,获取申请流的变量名字
 | // 这两个宏用来申请读入和读出流,实现反射并输出日志,获取申请流的变量名字
 | ||||||
| @ -21,10 +24,11 @@ | |||||||
| #define OPEN_IFS_AND_CHECK(file_path,value_name)std::ifstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open input file stream : "<<file_path.filename();throw std::runtime_error(ss.str());}else{zt::print("Open input file stream to value ["#value_name"] ok , from [",file_path.filename(),"]\n");} | #define OPEN_IFS_AND_CHECK(file_path,value_name)std::ifstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open input file stream : "<<file_path.filename();throw std::runtime_error(ss.str());}else{zt::print("Open input file stream to value ["#value_name"] ok , from [",file_path.filename(),"]\n");} | ||||||
| 
 | 
 | ||||||
| #define OPEN_OFS_AND_CHECK(file_path,value_name)std::ofstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open output file stream : "<<file_path.filename();throw std::runtime_error(ss.str());}else{zt::print("Open output file stream to value ["#value_name"] ok , from [",file_path.filename(),"]\n");} | #define OPEN_OFS_AND_CHECK(file_path,value_name)std::ofstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open output file stream : "<<file_path.filename();throw std::runtime_error(ss.str());}else{zt::print("Open output file stream to value ["#value_name"] ok , from [",file_path.filename(),"]\n");} | ||||||
| //最大DNA序列长度
 |  | ||||||
| const size_t MAX_SIZE = 5e4+5; |  | ||||||
| 
 | 
 | ||||||
| void reverseComplement(auto &DNAsequence, const size_t buf_size) //注意这里使用引用DNA sequence,避免拷贝开销
 | //最大DNA序列长度
 | ||||||
|  | const size_t MAX_SIZE_PER_DNA = 5e4+5; | ||||||
|  | 
 | ||||||
|  | void reverseComplement(char *begin, char *end) //注意这里使用引用DNA sequence,避免拷贝开销
 | ||||||
| { | { | ||||||
|     static const std::unordered_map<char, char> complement = { //这里使用查表的方式大大提高CPU速度,因为if分支CPU不容易命中缓存,需要使用查表加速
 |     static const std::unordered_map<char, char> complement = { //这里使用查表的方式大大提高CPU速度,因为if分支CPU不容易命中缓存,需要使用查表加速
 | ||||||
|         {'A', 'T'}, {'a', 'T'}, |         {'A', 'T'}, {'a', 'T'}, | ||||||
| @ -33,12 +37,12 @@ void reverseComplement(auto &DNAsequence, const size_t buf_size) //注意这里 | |||||||
|         {'G', 'C'}, {'g', 'C'} |         {'G', 'C'}, {'g', 'C'} | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     std::reverse(DNAsequence.begin(), DNAsequence.begin() + buf_size); //翻转DNA序列
 |     std::reverse(begin, end); //翻转DNA序列
 | ||||||
|      |      | ||||||
|     for (std::remove_const_t<decltype(buf_size)> i = 0; i < buf_size; ++i) { //std::remove_const_t<decltype(buf_size)>意思是和buf_size相同的类型并去掉const
 |     for (std::remove_const_t<decltype(begin)> i = begin; i < end; ++i) { //std::remove_const_t<decltype(buf_size)>意思是和buf_size相同的类型并去掉const
 | ||||||
|         auto it = complement.find(DNAsequence[i]);//查表并替换
 |         auto it = complement.find(*i);//查表并替换
 | ||||||
|         if (it != complement.end()) [[likely]] { |         if (it != complement.end()) [[likely]] { | ||||||
|             DNAsequence[i] = it->second; |             *i = it->second; | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @ -47,9 +51,9 @@ void reverseComplement(auto &DNAsequence, const size_t buf_size) //注意这里 | |||||||
| class Spent{ // 使用RAII原理的自动计时器,计算主函数运行时间,析构时自动输出
 | class Spent{ // 使用RAII原理的自动计时器,计算主函数运行时间,析构时自动输出
 | ||||||
| private: | private: | ||||||
|     const decltype(std::chrono::system_clock::now()) start; |     const decltype(std::chrono::system_clock::now()) start; | ||||||
|     const std::string_view name; |     const std::string name; | ||||||
| public: | public: | ||||||
|     Spent(const std::string_view name)noexcept:start(std::chrono::system_clock::now()),name(name){ |     Spent(const std::string name)noexcept:start(std::chrono::system_clock::now()),name(name){ | ||||||
|         zt::print("[Timer: ",name,"]"," Start timing","\n"); |         zt::print("[Timer: ",name,"]"," Start timing","\n"); | ||||||
|     } |     } | ||||||
|     ~Spent()noexcept{ |     ~Spent()noexcept{ | ||||||
| @ -65,46 +69,119 @@ int main() | |||||||
|         //std::ios_base::sync_with_stdio(false); //加了没效果 //这里直接关掉就行了,不会影响读入,因为目前是一次性读入。开了反而会让日志输出变成全缓冲,不友好
 |         //std::ios_base::sync_with_stdio(false); //加了没效果 //这里直接关掉就行了,不会影响读入,因为目前是一次性读入。开了反而会让日志输出变成全缓冲,不友好
 | ||||||
|         // using namespace std; // 别加,刚被坑了
 |         // using namespace std; // 别加,刚被坑了
 | ||||||
|          |          | ||||||
|         Spent all_spent("All spent"); //自动计时器,给主函数计时
 |         // std::array<char,MAX_SIZE> buf;
 | ||||||
|          |  | ||||||
|         std::array<char,MAX_SIZE> buf; |  | ||||||
| 
 | 
 | ||||||
|         bool lines = 0; //使用布尔值加速
 |         bool lines = 0; //使用布尔值加速
 | ||||||
| 
 | 
 | ||||||
|  |         const auto get_lines_add = [&lines]() { | ||||||
|  |             bool old_value = lines;  // 保存旧值
 | ||||||
|  |             lines = !lines;          // 改变布尔值
 | ||||||
|  |             return old_value;        // 返回旧值
 | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|         std::filesystem::path input_path("filteredReads.txt"),output_path("reversedSequence.txt"); |         std::filesystem::path input_path("filteredReads.txt"),output_path("reversedSequence.txt"); | ||||||
|          |          | ||||||
|         OPEN_IFS_AND_CHECK(input_path, input_file_stream) //创建输入和输出流
 |         OPEN_IFS_AND_CHECK(input_path, input_file_stream) //创建输入和输出流
 | ||||||
|         OPEN_OFS_AND_CHECK(output_path, output_file_stream) |         OPEN_OFS_AND_CHECK(output_path, output_file_stream) | ||||||
| 
 | 
 | ||||||
|         while (input_file_stream.getline(buf.data(),MAX_SIZE,'\n')) |         const size_t BUF_SIZE  = (size_t)4 * 1024 * 1024 *1024; //4GB + 区块大小一点冗余 ///////////////////////////设置区块大小
 | ||||||
|  |         // const size_t BUF_SIZE  = (size_t)400*1024*1024; //4GB + 一点冗余 // 测试用
 | ||||||
|  | 
 | ||||||
|  |         std::vector<char> buf(BUF_SIZE); // 堆上分配可以大一点
 | ||||||
|  |         std::array<char, MAX_SIZE_PER_DNA> tmp_buf;//用于处理截断的DNA,直接在栈上申请
 | ||||||
|  |          | ||||||
|  |         Spent all_spent("All spent"); //自动计时器,给主函数计时
 | ||||||
|  |         unsigned int chunk_id = 0; | ||||||
|  |         size_t last_buf_size = 0; | ||||||
|  |         while (input_file_stream.eof()==false) | ||||||
|         { |         { | ||||||
|  |             Spent chunk_spent(zt::fmt("chunk_id:[",++chunk_id,"]")); | ||||||
|  |             { | ||||||
|  |                 Spent chunk_read_spent(zt::fmt("read_chunk_id:[",chunk_id,"]")); | ||||||
|  |                 input_file_stream.read(buf.data(),buf.size()); | ||||||
|  |             } | ||||||
|             // lines=!lines; //防止溢出
 |             // lines=!lines; //防止溢出
 | ||||||
|             const auto buf_len = strlen(buf.data()); |             const auto buf_len = input_file_stream.gcount(); | ||||||
|             const std::string_view suffix("\n"); //设置一个每个DNA序列结尾的字符,这里是以\n换行来结尾
 | 
 | ||||||
|             if (lines == true){ |             zt::print(NAME_VALUE(buf_len),"\n"); | ||||||
|                 // output_file_stream << reverseComplement(buf) << endl;
 |              | ||||||
|                 reverseComplement(buf,buf_len); |             if(buf_len == std::numeric_limits<decltype(buf_len)>::max())[[unlikely]]{ | ||||||
|  |                 THROW_RT_ERROR("get input file stream read buf size failed\n") | ||||||
|             } |             } | ||||||
|             // buf+=suffix;
 | 
 | ||||||
|             for(std::remove_const_t<decltype(suffix.size())> i=0;i<suffix.size();i++){ |             if(buf_len == 0)[[unlikely]]{ | ||||||
|                 buf[buf_len+i] = suffix[i]; |                 break; | ||||||
|             } |             } | ||||||
|             output_file_stream.write(buf.data(), buf_len+suffix.size()); // 写入文件
 | 
 | ||||||
|             lines=!lines; //bool取反
 |             const std::string_view buf_str_v(buf.data(),buf_len); //string_view是零拷贝,但是要注意悬垂引用
 | ||||||
|  | 
 | ||||||
|  |             size_t start_pos = 0; | ||||||
|  |             size_t end_pos = 0; | ||||||
|  | 
 | ||||||
|  |             if(last_buf_size>0)[[likely]]{ | ||||||
|  |                 Spent recovery_interrupt_spent(zt::fmt("recovery_interrupt [",chunk_id,"]")); | ||||||
|  |                 if((end_pos=buf_str_v.find('\n',start_pos)) != std::string_view::npos)[[likely]]{ | ||||||
|  |                     std::memcpy(tmp_buf.data()+last_buf_size,buf.data(),end_pos+1); | ||||||
|  |                     if(get_lines_add()){ | ||||||
|  |                         reverseComplement(tmp_buf.data(), tmp_buf.data()+last_buf_size+end_pos); | ||||||
|  |                     } | ||||||
|  |                     // lines=!lines;
 | ||||||
|  |                     output_file_stream.write(tmp_buf.data(), last_buf_size+end_pos+1); | ||||||
|  |                 }else{ | ||||||
|  |                     THROW_RT_ERROR("DNA incompleteness") | ||||||
|  |                 } | ||||||
|  |                 last_buf_size=0; | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |             { | ||||||
|  |                 Spent calculate_spent(zt::fmt("calculate_chunk_id:[",chunk_id,"]")); | ||||||
|  | 
 | ||||||
|  |                 while((end_pos=buf_str_v.find('\n',start_pos)) != std::string_view::npos){ | ||||||
|  |                     if(get_lines_add()){ | ||||||
|  |                         reverseComplement(buf.data()+start_pos, buf.data()+end_pos); | ||||||
|  |                     } | ||||||
|  |                     // lines=!lines;
 | ||||||
|  |                     start_pos=end_pos+1; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |              | ||||||
|  |             if(start_pos!=buf_len){ | ||||||
|  |                 zt::print("Saving interrupt chunk_id[",chunk_id,"]\n"); | ||||||
|  |                 std::memcpy(tmp_buf.data(),buf.data()+start_pos+1,(last_buf_size = buf_len-start_pos-1)); | ||||||
|  |             } | ||||||
|  |             { | ||||||
|  |                 Spent chunk_write_spent(zt::fmt("write_chunk_id:[",chunk_id,"] , ","[Wrote bytes] ",NAME_VALUE(start_pos))); | ||||||
|  |                 output_file_stream.write(buf.data(), start_pos); | ||||||
|  |             } | ||||||
|  |             // zt::print("[Wrote bytes] ",NAME_VALUE(start_pos),"\n");
 | ||||||
|  |             // const std::string_view suffix("\n"); //设置一个每个DNA序列结尾的字符,这里是以\n换行来结尾
 | ||||||
|  | 
 | ||||||
|  |              | ||||||
|  |             // if (lines == true){
 | ||||||
|  |             //     // output_file_stream << reverseComplement(buf) << endl;
 | ||||||
|  |             //     reverseComplement(buf,buf_len);
 | ||||||
|  |             // }
 | ||||||
|  |             // // buf+=suffix;
 | ||||||
|  |             // for(std::remove_const_t<decltype(suffix.size())> i=0;i<suffix.size();i++){
 | ||||||
|  |             //     buf[buf_len+i] = suffix[i];
 | ||||||
|  |             // }
 | ||||||
|  |             // output_file_stream.write(buf.data(), buf_len+suffix.size()); // 写入文件
 | ||||||
|  |              | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         return 0; |  | ||||||
|     }catch(const std::exception &e){ |     }catch(const std::exception &e){ | ||||||
|         zt::eprint( |         zt::eprint( | ||||||
|             "Caught an error because:\n", |             "Caught an error because:\n", | ||||||
|             "\t",NAME_VALUE(e.what()),"\n" |             "\t",NAME_VALUE(e.what()),"\n" | ||||||
|             "Closing\n" |             "Closing\n" | ||||||
|         ); |         ); | ||||||
|  |         throw  e; | ||||||
|     }catch(...){ |     }catch(...){ | ||||||
|         zt::eprint( |         zt::eprint( | ||||||
|             "Caught an unknown error :\n", |             "Caught an unknown error :\n", | ||||||
|             "Closing\n" |             "Closing\n" | ||||||
|         ); |         ); | ||||||
|  |         throw; | ||||||
|     } |     } | ||||||
|     return -1; |  | ||||||
| } | } | ||||||
|  | |||||||
| @ -3,6 +3,8 @@ | |||||||
| #include <iostream> | #include <iostream> | ||||||
| #include <ostream> | #include <ostream> | ||||||
| #include <sstream> | #include <sstream> | ||||||
|  | #include <string> | ||||||
|  | #include <utility> | ||||||
| 
 | 
 | ||||||
| #define NAME_VALUE(v)#v," : ",(v) | #define NAME_VALUE(v)#v," : ",(v) | ||||||
| 
 | 
 | ||||||
| @ -70,6 +72,14 @@ namespace zt { | |||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     template<class ...Args> | ||||||
|  |     inline std::string fmt(Args&&... args) { | ||||||
|  |         std::ostringstream oss; | ||||||
|  |         (oss << ... << std::forward<Args>(args)); | ||||||
|  |         return oss.str(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     #define THROW_RT_ERROR(why)throw std::runtime_error(zt::fmt("[FILE:",__FILE__,"] [LINE:",__LINE__,"] why:",why)); | ||||||
|     // template <class ...Args>
 |     // template <class ...Args>
 | ||||||
|     // inline void check_fstream_isopen(const Args&...args)noexcept(false){
 |     // inline void check_fstream_isopen(const Args&...args)noexcept(false){
 | ||||||
|     //     bool is_open=true;
 |     //     bool is_open=true;
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user