DNASequence/src/tools/dna.hpp
2024-09-20 15:47:58 +08:00

170 lines
7.5 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <array>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <fstream>
#include <functional>
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include "tools.hpp" // 自己写的库在src/tools/tools.hpp当中注意要使用C++23标准编译
#include <cstring>
#include <stdio.h>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
#include <omp.h>
#include <filesystem>
// 这两个宏用来申请读入和读出流,实现反射并输出日志,获取申请流的变量名字
#define OPEN_IFS_AND_CHECK(file_path,value_name)std::ifstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open input file stream : "<<file_path.filename();throw std::runtime_error(ss.str());}else{zt::print("Open input file stream to value ["#value_name"] ok , from [",file_path.filename(),"]\n");}
#define OPEN_OFS_AND_CHECK(file_path,value_name)std::ofstream value_name(file_path);if(value_name.is_open()==false){std::stringstream ss;ss<<"cannot open output file stream : "<<file_path.filename();throw std::runtime_error(ss.str());}else{zt::print("Open output file stream to value ["#value_name"] ok , from [",file_path.filename(),"]\n");}
namespace dna {
//最大DNA序列长度
// const size_t MAX_SIZE_PER_DNA = 5e4+5;
//
class Spent{ // 使用RAII原理的自动计时器计算主函数运行时间析构时自动输出
private:
const decltype(std::chrono::system_clock::now()) start;
const std::string name;
public:
Spent(const std::string name)noexcept:start(std::chrono::system_clock::now()),name(name){
zt::print("[Timer: ",name,"]"," Start timing","\n");
}
~Spent()noexcept{
const auto end = std::chrono::system_clock::now();
const auto dur = std::chrono::duration_cast<std::chrono::milliseconds> (end-start);
zt::print("[Timer: ",name,"]"," Stop timing , used ", dur.count(),"ms\n");
}
};
template<size_t BUF_SIZE = (size_t)4 * 1024 * 1024 *1024 , size_t MAX_SIZE_PER_DNA = (size_t)5e4+5>
inline void open_file_and_calculate(std::filesystem::path input_path,std::filesystem::path output_path, std::function<void(char *begin, char *end)> reverseComplement){
//std::ios_base::sync_with_stdio(false); //加了没效果 //这里直接关掉就行了,不会影响读入,因为目前是一次性读入。开了反而会让日志输出变成全缓冲,不友好
// using namespace std; // 别加,刚被坑了
// std::array<char,MAX_SIZE> buf;
bool lines = 0; //使用布尔值加速
const auto get_lines_add = [&lines]() {
bool old_value = lines; // 保存旧值
lines = !lines; // 改变布尔值
return old_value; // 返回旧值
};
// std::filesystem::path input_path("filteredReads.txt"),output_path("reversedSequence.txt");
OPEN_IFS_AND_CHECK(input_path, input_file_stream) //创建输入和输出流
OPEN_OFS_AND_CHECK(output_path, output_file_stream)
// const size_t BUF_SIZE = (size_t)4 * 1024 * 1024 *1024; //4GB///////////////////////////设置区块大小
// const size_t BUF_SIZE = (size_t)400*1024*1024; //4GB + 一点冗余 // 测试用
std::vector<char> buf(BUF_SIZE); // 堆上分配可以大一点
std::array<char, MAX_SIZE_PER_DNA> tmp_buf;//用于处理截断的DNA直接在栈上申请
zt::print("Chunk size :",BUF_SIZE," bytes\n");
Spent all_spent("All spent"); //自动计时器,给主函数计时
unsigned int chunk_id = 0;
size_t last_buf_size = 0;
while (input_file_stream.eof()==false)
{
size_t recovered_size = 0;
Spent chunk_spent(zt::fmt("chunk_id:[",++chunk_id,"]"));
{
Spent chunk_read_spent(zt::fmt("read_chunk_id:[",chunk_id,"]"));
input_file_stream.read(buf.data(),buf.size());
}
// lines=!lines; //防止溢出
const auto buf_len = input_file_stream.gcount();
zt::print(NAME_VALUE(buf_len),"\n");
if(buf_len == std::numeric_limits<decltype(buf_len)>::max())[[unlikely]]{
THROW_RT_ERROR("get input file stream read buf size failed\n")
}
if(buf_len == 0)[[unlikely]]{
break;
}
const std::string_view buf_str_v(buf.data(),buf_len); //string_view是零拷贝但是要注意悬垂引用
size_t start_pos = 0;
size_t end_pos = 0;
if( last_buf_size > 0 ) [[likely]] {
Spent recovery_interrupt_spent(zt::fmt("recovery_interrupt [",chunk_id,"]"));
if( (end_pos=buf_str_v.find('\n',start_pos)) != std::string_view::npos ) [[likely]] {
std::memcpy(tmp_buf.data()+last_buf_size,buf.data(),end_pos+1); //+1加的是空格位置
if(get_lines_add()){
reverseComplement(tmp_buf.data(), tmp_buf.data()+last_buf_size+end_pos); //注意传入函数时不要传入空格
}
// lines=!lines;
output_file_stream.write(tmp_buf.data(), last_buf_size+end_pos+1); //+1加的是空格位置
start_pos=end_pos+1;
recovered_size=end_pos+1;
}else{
THROW_RT_ERROR("DNA incompleteness, chunk Size is too small !!")
}
last_buf_size=0;
}
{
Spent calculate_spent(zt::fmt("calculate_chunk_id:[",chunk_id,"]"));
#pragma omp parallel
#pragma omp single
{
while ((end_pos = buf_str_v.find('\n', start_pos)) != std::string_view::npos) {
char *start = buf.data() + start_pos;
char *end = buf.data() + end_pos;
// 确保每个任务只操作属于它的独立数据
if (get_lines_add()) {
// 为每一块数据创建一个任务
#pragma omp task
// #pragma omp task firstprivate(start, end)
{
reverseComplement(start, end); // 任务内处理反转互补
}
}
start_pos = end_pos + 1;
}
}
// while((end_pos=buf_str_v.find('\n',start_pos)) != std::string_view::npos){
// if(get_lines_add()){
// reverseComplement(buf.data()+start_pos, buf.data()+end_pos); //这个函数并行优化
// }
// // lines=!lines;
// start_pos=end_pos+1;
// }
}
if(start_pos!=buf_len){
zt::print("Saving interrupt chunk_id[",chunk_id,"]\n");
std::memcpy(tmp_buf.data(),buf.data()+start_pos,(last_buf_size = buf_len-start_pos)); //这里边界处理脑子要炸了
}
{
Spent chunk_write_spent(zt::fmt("write_chunk_id:[",chunk_id,"] , ","[Wrote bytes] ",NAME_VALUE(start_pos)));
output_file_stream.write(buf.data()+recovered_size, start_pos-recovered_size);
}
}
}
}