diff --git a/CMakeLists.txt b/CMakeLists.txt index 859172c..1dae334 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ foreach(SRC_CPP IN LISTS SRC_CPPS) message("found cpp : ${SRC_CPP}") endforeach() -add_executable(${PROJECT_NAME}_bin ${SRC_HPPS} ${SRC_CPPS}) +# add_executable(${PROJECT_NAME}_bin ${SRC_HPPS} ${SRC_CPPS}) add_subdirectory(pybind11) diff --git a/dna.pyi b/dna.pyi new file mode 100644 index 0000000..cacf1dd --- /dev/null +++ b/dna.pyi @@ -0,0 +1,11 @@ +# void convert_from_file( +# std::string source_file=std::string("filteredReads.txt"), +# std::string destination_file=std::string("reversedSequence.txt"), +# size_t buffer_size_gb=4 +# ) + +def convert_from_file( + source_file:str="filteredReads.txt", + destination_file:str="reversedSequence.txt", + buffer_size_gb:int=4 + )->None:... \ No newline at end of file diff --git a/src/dna.hpp b/src/dna.hpp index 7b9637e..4bcbca4 100644 --- a/src/dna.hpp +++ b/src/dna.hpp @@ -1 +1,37 @@ -#pragma once \ No newline at end of file +#pragma once +#include +#include +#include +#include +#include + +template +std::string fmt(Args&&...args){ /* streams every argument into one std::ostringstream and returns the accumulated text */ + std::ostringstream oss; + (oss<<...<(args)); + return std::move(std::string(oss.str())); /* NOTE(review): oss.str() already yields a temporary, so the extra copy and std::move look redundant */ +} + +template +void print(Args...args){ + std::ostringstream oss; + (oss<<...<(args)); + std::cout< (end-start); + print("[Timer: ",name,"]"," Stop timing , used ", dur.count(),"ms\n"); + } + }; \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index aef7e59..7af3ff9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,8 +1,92 @@ #include "dna.hpp" +#include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include -int main(){ - +namespace py = pybind11; +static constexpr size_t default_buffer_size{512*1024*1024}; + +static const std::unordered_map complement = { // use a lookup table here to greatly improve CPU speed: with if-branches the CPU does not hit the cache easily, so table lookup is used for acceleration. NOTE(review): std::unordered_map hashes on every lookup; a plain 256-entry array would be a direct table -- measure before relying on this claim + {'A', 'T'}, {'a', 'T'}, + {'T', 'A'}, {'t', 'A'}, + {'C', 'G'}, {'c', 'G'}, + {'G', 'C'}, 
{'g', 'C'} +}; + +void reverseComplement(char *begin, char *end) +{ + // Reverse-complements the range [begin, end) in place. Note: end is exclusive (half-open range) and must not be accessed + std::reverse(begin, end); // reverse the DNA sequence + + for (ptrdiff_t i = 0; i < (end - begin); ++i) { + // static int _ = (zt::print(NAME_VALUE(omp_get_num_threads()),"\n"),0); // print the number of threads + auto it = complement.find(begin[i]); + if (it != complement.end()) { /* characters that are not in the table are left unchanged */ + begin[i] = it->second; + } + } } +void convert_from_file( /* Copies source_file to destination_file line by line, reverse-complementing every second line (the first line is copied as is). buffer_size bytes are split evenly between the read and the write buffer. Throws std::runtime_error if either file cannot be opened. */ + std::string source_file=std::string("filteredReads.txt"), + std::string destination_file=std::string("reversedSequence.txt"), + size_t buffer_size=default_buffer_size +) +{ + // std::iostream::sync_with_stdio(false); + + const size_t max_size_pre_dna{(size_t)5e4+5}; + const size_t all_buf_size = {buffer_size}; + const size_t read_bufsize{all_buf_size/2},write_bufsize{all_buf_size/2}; + + print(NVC(max_size_pre_dna),'\n', + NVC(all_buf_size),'\n', + NVC(read_bufsize),'\n', + NVC(write_bufsize),'\n'); + std::ifstream ifs(source_file); + if(ifs.is_open()==false)throw std::runtime_error(fmt("Cannot open input file stream\nfilename: ",source_file,'\n')); + + std::ofstream ofs(destination_file); + if(ofs.is_open()==false)throw std::runtime_error(fmt("Cannot open output file stream\nfilename: ",destination_file,'\n')); + std::cout<<"Open file ok ,getting memory\n"; + std::vector read_buf(read_bufsize), write_buf(write_bufsize); + ifs.rdbuf()->pubsetbuf(read_buf.data(), read_buf.size()); /* NOTE(review): whether setbuf has any effect once the file is already open is implementation-defined -- confirm on the target standard library */ + ofs.rdbuf()->pubsetbuf(write_buf.data(), write_buf.size()); + + std::array dna_buf; + bool is_dna_line{false}; + std::cout<<"computing\n"; + Spent all_spent("all_spent_time"); + while(ifs.getline(dna_buf.data(),dna_buf.size())){ /* NOTE(review): getline sets failbit when a line does not fit in dna_buf, which would end this loop early -- confirm input lines stay within the limit */ + const size_t new_buflen{strlen(dna_buf.data())}; + if(is_dna_line){ + reverseComplement(dna_buf.data(), dna_buf.data()+new_buflen); + // std::cout<<"complete one ok\n"; + } + // print(NVC(new_buflen),'\n'); + // print(NVC(dna_buf.data())); + dna_buf[new_buflen]='\n'; /* overwrite the terminating NUL with a newline so the line and its separator go out in one write */ + ofs.write(dna_buf.data(),new_buflen+1); + is_dna_line=!is_dna_line; /* only every other line is treated as DNA */ + } + ofs.flush(); + 
print("done\n"); +} + +PYBIND11_MODULE(dna, m) { + m.doc() = "A dna base conversion library"; // optional module docstring + + m.def("convert_from_file", &convert_from_file, "dna base switching from file.", + py::arg("source_file")= std::string("filteredReads.txt"), + py::arg("destination_file")=std::string("reversedSequence.txt"), + py::arg("buffer_size")=default_buffer_size /* NOTE(review): dna.pyi declares this keyword as buffer_size_gb with default 4, while the binding exposes buffer_size defaulting to default_buffer_size (a byte count) -- confirm which one is intended */ + ); +} \ No newline at end of file