#include "query.h" #include #include #include #include #include #include #include "function_output_iterator.hpp" #include "compilation_database_loader.h" #include "optional.h" #include "indexer.h" //#define CATCH_CONFIG_MAIN //#include "catch.hpp" // TODO: Make all copy constructors explicit. // NOTE: When not inside of a |def| object, there can be duplicates of the same // information if that information is contributed from separate sources. // If we need to avoid this duplication in the future, we will have to // add a refcount. template std::vector Transform(const std::vector& input, std::function op) { std::vector result; result.reserve(input.size()); for (const In& in : input) result.push_back(op(in)); return result; } Usr MapIdToUsr(IdCache& id_cache, const TypeId& id) { return id_cache.type_id_to_usr[id]; } Usr MapIdToUsr(IdCache& id_cache, const FuncId& id) { return id_cache.func_id_to_usr[id]; } Usr MapIdToUsr(IdCache& id_cache, const VarId& id) { return id_cache.var_id_to_usr[id]; } QueryableLocation MapIdToUsr(IdCache& id_cache, const Location& id) { return QueryableLocation(id_cache.file_id_to_file_path[id.file_id()], id.line, id.column, id.interesting); } std::vector MapIdToUsr(IdCache& id_cache, const std::vector& ids) { return Transform(ids, [&](TypeId id) { return id_cache.type_id_to_usr[id]; }); } std::vector MapIdToUsr(IdCache& id_cache, const std::vector& ids) { return Transform(ids, [&](FuncId id) { return id_cache.func_id_to_usr[id]; }); } std::vector MapIdToUsr(IdCache& id_cache, const std::vector& ids) { return Transform(ids, [&](VarId id) { return id_cache.var_id_to_usr[id]; }); } std::vector MapIdToUsr(IdCache& id_cache, const std::vector& ids) { return Transform(ids, [&](FuncRef ref) { UsrRef result; result.loc = MapIdToUsr(id_cache, ref.loc); result.usr = id_cache.func_id_to_usr[ref.id]; return result; }); } std::vector MapIdToUsr(IdCache& id_cache, const std::vector& ids) { return Transform(ids, [&](Location id) { return QueryableLocation(id_cache.file_id_to_file_path[id.file_id()], id.line, id.column, id.interesting); }); } QueryableTypeDef::DefUpdate MapIdToUsr(IdCache& id_cache, const TypeDefDefinitionData<>& def) { QueryableTypeDef::DefUpdate result(def.usr); if (result.definition) result.definition = MapIdToUsr(id_cache, def.definition.value()); if (result.alias_of) result.alias_of = MapIdToUsr(id_cache, def.alias_of.value()); result.parents = MapIdToUsr(id_cache, def.parents); result.types = MapIdToUsr(id_cache, def.types); result.funcs = MapIdToUsr(id_cache, def.funcs); result.vars = MapIdToUsr(id_cache, def.vars); return result; } QueryableFuncDef::DefUpdate MapIdToUsr(IdCache& id_cache, const FuncDefDefinitionData<>& def) { QueryableFuncDef::DefUpdate result(def.usr); if (result.definition) result.definition = MapIdToUsr(id_cache, def.definition.value()); if (result.declaring_type) result.declaring_type = MapIdToUsr(id_cache, def.declaring_type.value()); if (result.base) result.base = MapIdToUsr(id_cache, def.base.value()); result.locals = MapIdToUsr(id_cache, def.locals); result.callees = MapIdToUsr(id_cache, def.callees); return result; } QueryableVarDef::DefUpdate MapIdToUsr(IdCache& id_cache, const VarDefDefinitionData<>& def) { QueryableVarDef::DefUpdate result(def.usr); if (result.declaration) result.declaration = MapIdToUsr(id_cache, def.declaration.value()); if (result.definition) result.definition = MapIdToUsr(id_cache, def.definition.value()); if (result.variable_type) result.variable_type = MapIdToUsr(id_cache, def.variable_type.value()); if 
QueryableFile::QueryableFile(const IndexedFile& indexed) : file_id(indexed.path) {
  auto add_outline = [this, &indexed](Usr usr, Location location) {
    outline.push_back(UsrRef(usr, MapIdToUsr(*indexed.id_cache, location)));
  };

  for (const IndexedTypeDef& def : indexed.types) {
    if (def.def.definition.has_value())
      add_outline(def.def.usr, def.def.definition.value());
  }
  for (const IndexedFuncDef& def : indexed.funcs) {
    for (Location decl : def.declarations)
      add_outline(def.def.usr, decl);
    if (def.def.definition.has_value())
      add_outline(def.def.usr, def.def.definition.value());
  }
  for (const IndexedVarDef& def : indexed.vars) {
    if (def.def.definition.has_value())
      add_outline(def.def.usr, def.def.definition.value());
  }

  std::sort(outline.begin(), outline.end(), [](const UsrRef& a, const UsrRef& b) {
    return a.loc < b.loc;
  });
}

QueryableTypeDef::QueryableTypeDef(IdCache& id_cache, const IndexedTypeDef& indexed)
    : def(MapIdToUsr(id_cache, indexed.def)) {
  derived = MapIdToUsr(id_cache, indexed.derived);
  uses = MapIdToUsr(id_cache, indexed.uses);
}

QueryableFuncDef::QueryableFuncDef(IdCache& id_cache, const IndexedFuncDef& indexed)
    : def(MapIdToUsr(id_cache, indexed.def)) {
  declarations = MapIdToUsr(id_cache, indexed.declarations);
  derived = MapIdToUsr(id_cache, indexed.derived);
  callers = MapIdToUsr(id_cache, indexed.callers);
  uses = MapIdToUsr(id_cache, indexed.uses);
}

QueryableVarDef::QueryableVarDef(IdCache& id_cache, const IndexedVarDef& indexed)
    : def(MapIdToUsr(id_cache, indexed.def)) {
  uses = MapIdToUsr(id_cache, indexed.uses);
}

// TODO: For space reasons, it may make sense to map Usr -> offset inside of
//       global storage, but not for intermediate or disk storage. We can
//       probably eliminate most of that pain by coming up with our own UsrDb
//       concept which interns the Usr strings. We can lessen the pain of a
//       global UsrDb by doing:
//         (parallel) clang index ->
//         (main) commit USRs to global ->
//         (parallel) transfer IDs to global USRs ->
//         (main) import
// TODO: remove GroupId concept.

struct CachedIndexedFile {
  // Path to the file indexed.
  std::string path;

  // TODO: Make sure that |previous_index| and |current_index| use the same id
  // to USR mapping. This lets us greatly speed up difference computation.

  // The previous index. This is used for index updates, so we only apply an
  // update diff when changing the global db.
  optional<IndexedFile> previous_index;
  IndexedFile current_index;

  CachedIndexedFile(const IndexedFile& indexed) : current_index(indexed) {}
};

template <typename T>
void AddRange(std::vector<T>* dest, const std::vector<T>& to_add) {
  for (const T& e : to_add)
    dest->push_back(e);
}

template <typename T>
void RemoveRange(std::vector<T>* dest, const std::vector<T>& to_remove) {
  auto it = std::remove_if(dest->begin(), dest->end(), [&](const T& t) {
    // TODO: make |to_remove| a set?
    return std::find(to_remove.begin(), to_remove.end(), t) != to_remove.end();
  });
  if (it != dest->end())
    dest->erase(it, dest->end());
}

// Compares |previous| and |current|, adding all elements that are
// in |previous| but not |current| to |removed|, and all elements
// that are in |current| but not |previous| to |added|.
//
// Returns true iff |removed| or |added| are non-empty.
template <typename T>
bool ComputeDifferenceForUpdate(
    std::vector<T>& previous, std::vector<T>& current,
    std::vector<T>* removed, std::vector<T>* added) {
  // We need to sort to use std::set_difference.
  std::sort(previous.begin(), previous.end());
  std::sort(current.begin(), current.end());

  // Returns the elements in |previous| that are not in |current|.
  std::set_difference(
      previous.begin(), previous.end(),
      current.begin(), current.end(),
      std::back_inserter(*removed));
  // Returns the elements in |current| that are not in |previous|.
  std::set_difference(
      current.begin(), current.end(),
      previous.begin(), previous.end(),
      std::back_inserter(*added));

  return !removed->empty() || !added->empty();
}
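// A minimal usage sketch for ComputeDifferenceForUpdate; the values below are
// hypothetical and only illustrate how elements are split into |removed| and
// |added|.
//
//   std::vector<int> previous = {1, 2, 3};
//   std::vector<int> current  = {2, 3, 4};
//   std::vector<int> removed, added;
//   ComputeDifferenceForUpdate(previous, current, &removed, &added);
//   // removed == {1}, added == {4}, and the call returns true.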
template <typename T>
void CompareGroups(
    std::vector<T>& previous_data, std::vector<T>& current_data,
    std::function<void(T*)> on_removed,
    std::function<void(T*)> on_added,
    std::function<void(T*, T*)> on_found) {
  std::sort(previous_data.begin(), previous_data.end());
  std::sort(current_data.begin(), current_data.end());

  auto prev_it = previous_data.begin();
  auto curr_it = current_data.begin();
  while (prev_it != previous_data.end() && curr_it != current_data.end()) {
    // same id
    if (prev_it->def.usr == curr_it->def.usr) {
      if (!prev_it->is_bad_def && !curr_it->is_bad_def)
        on_found(&*prev_it, &*curr_it);
      else if (prev_it->is_bad_def)
        on_added(&*curr_it);
      else if (curr_it->is_bad_def)
        on_removed(&*curr_it);

      ++prev_it;
      ++curr_it;
    }
    // prev_id is smaller - prev_it has data curr_it does not have.
    else if (prev_it->def.usr < curr_it->def.usr) {
      on_removed(&*prev_it);
      ++prev_it;
    }
    // prev_id is bigger - curr_it has data prev_it does not have.
    else {
      on_added(&*curr_it);
      ++curr_it;
    }
  }

  // If prev_it still has data, that means it is not in curr_it and was removed.
  while (prev_it != previous_data.end()) {
    on_removed(&*prev_it);
    ++prev_it;
  }

  // If curr_it still has data, that means it is not in prev_it and was added.
  while (curr_it != current_data.end()) {
    on_added(&*curr_it);
    ++curr_it;
  }
}
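// A hypothetical call sketch, for illustration only, mirroring the calls in
// IndexUpdate's constructor below: diffing the types of two snapshots of the
// same file. Each IndexedTypeDef is routed to exactly one callback depending
// on whether its USR appears only in the previous snapshot, only in the
// current snapshot, or in both.
//
//   CompareGroups(previous_file.types, current_file.types,
//       /*on_removed:*/ [](IndexedTypeDef* def) { /* only in previous */ },
//       /*on_added:*/   [](IndexedTypeDef* def) { /* only in current */ },
//       /*on_found:*/   [](IndexedTypeDef* previous_def, IndexedTypeDef* current_def) {
//         /* in both snapshots; compare the two definitions */
//       });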
IndexUpdate::IndexUpdate(IndexedFile& file) {
  files_added.push_back(QueryableFile(file));
  for (const IndexedTypeDef& def : file.types)
    types_added.push_back(QueryableTypeDef(*file.id_cache, def));
  for (const IndexedFuncDef& def : file.funcs)
    funcs_added.push_back(QueryableFuncDef(*file.id_cache, def));
  for (const IndexedVarDef& def : file.vars)
    vars_added.push_back(QueryableVarDef(*file.id_cache, def));
}

IndexUpdate::IndexUpdate(IndexedFile& previous_file, IndexedFile& current_file) {
#define JOIN(a, b) a##b
  // |query_name| is the name of the variable on the query type.
  // |index_name| is the name of the variable on the index type.
  // |type| is the type of the variable.
#define PROCESS_UPDATE_DIFF(query_name, index_name, type) \
  { \
    /* Check for changes. */ \
    std::vector<type> removed, added; \
    bool did_add = ComputeDifferenceForUpdate( \
        MapIdToUsr(*previous_file.id_cache, JOIN(previous_def->, index_name)), \
        MapIdToUsr(*current_file.id_cache, JOIN(current_def->, index_name)), \
        &removed, &added); \
    if (did_add) { \
      std::cout << "Adding mergeable update on " << current_def->def.short_name \
                << " (" << current_def->def.usr << ") for field " << #index_name << std::endl; \
      query_name.push_back(MergeableUpdate<type>(current_def->def.usr, removed, added)); \
    } \
  }

  // File
  do {
    // Outline is a special property and needs special handling, because it is
    // a computed property of the IndexedFile (ie, to view it we need to
    // construct a QueryableFile instance).
    assert(previous_file.path == current_file.path);
    QueryableFile previous_queryable_file(previous_file);
    QueryableFile current_queryable_file(current_file);
    std::vector<UsrRef> removed, added;
    bool did_add = ComputeDifferenceForUpdate(
        previous_queryable_file.outline,
        current_queryable_file.outline,
        &removed, &added);
    if (did_add) {
      std::cout << "Adding mergeable update on outline (" << current_file.path << ")" << std::endl;
      files_outline.push_back(MergeableUpdate<UsrRef>(current_file.path, removed, added));
    }
  } while (false);  // do-while(false) instead of just {} to appease the Visual Studio code formatter.

  // Types
  CompareGroups(previous_file.types, current_file.types,
      /*onRemoved:*/ [this](IndexedTypeDef* def) {
        types_removed.push_back(def->def.usr);
      },
      /*onAdded:*/ [this, &current_file](IndexedTypeDef* def) {
        types_added.push_back(QueryableTypeDef(*current_file.id_cache, *def));
      },
      /*onFound:*/ [this, &previous_file, &current_file](IndexedTypeDef* previous_def, IndexedTypeDef* current_def) {
        QueryableTypeDef::DefUpdate previous_remapped_def = MapIdToUsr(*previous_file.id_cache, previous_def->def);
        QueryableTypeDef::DefUpdate current_remapped_def = MapIdToUsr(*current_file.id_cache, current_def->def);
        if (previous_remapped_def != current_remapped_def)
          types_def_changed.push_back(current_remapped_def);

        PROCESS_UPDATE_DIFF(types_derived, derived, Usr);
        PROCESS_UPDATE_DIFF(types_uses, uses, QueryableLocation);
      });

  // Functions
  CompareGroups(previous_file.funcs, current_file.funcs,
      /*onRemoved:*/ [this](IndexedFuncDef* def) {
        funcs_removed.push_back(def->def.usr);
      },
      /*onAdded:*/ [this, &current_file](IndexedFuncDef* def) {
        funcs_added.push_back(QueryableFuncDef(*current_file.id_cache, *def));
      },
      /*onFound:*/ [this, &previous_file, &current_file](IndexedFuncDef* previous_def, IndexedFuncDef* current_def) {
        QueryableFuncDef::DefUpdate previous_remapped_def = MapIdToUsr(*previous_file.id_cache, previous_def->def);
        QueryableFuncDef::DefUpdate current_remapped_def = MapIdToUsr(*current_file.id_cache, current_def->def);
        if (previous_remapped_def != current_remapped_def)
          funcs_def_changed.push_back(current_remapped_def);

        PROCESS_UPDATE_DIFF(funcs_declarations, declarations, QueryableLocation);
        PROCESS_UPDATE_DIFF(funcs_derived, derived, Usr);
        PROCESS_UPDATE_DIFF(funcs_callers, callers, UsrRef);
        PROCESS_UPDATE_DIFF(funcs_uses, uses, QueryableLocation);
      });

  // Variables
  CompareGroups(previous_file.vars, current_file.vars,
      /*onRemoved:*/ [this](IndexedVarDef* def) {
        vars_removed.push_back(def->def.usr);
      },
      /*onAdded:*/ [this, &current_file](IndexedVarDef* def) {
        vars_added.push_back(QueryableVarDef(*current_file.id_cache, *def));
      },
      /*onFound:*/ [this, &previous_file, &current_file](IndexedVarDef* previous_def, IndexedVarDef* current_def) {
        QueryableVarDef::DefUpdate previous_remapped_def = MapIdToUsr(*previous_file.id_cache, previous_def->def);
        QueryableVarDef::DefUpdate current_remapped_def = MapIdToUsr(*current_file.id_cache, current_def->def);
        if (previous_remapped_def != current_remapped_def)
          vars_def_changed.push_back(current_remapped_def);

        PROCESS_UPDATE_DIFF(vars_uses, uses, QueryableLocation);
      });

#undef PROCESS_UPDATE_DIFF
#undef JOIN
}

void IndexUpdate::Merge(const IndexUpdate& update) {
#define INDEX_UPDATE_MERGE(name) \
  AddRange(&name, update.name);

  INDEX_UPDATE_MERGE(files_removed);
  INDEX_UPDATE_MERGE(files_added);
  INDEX_UPDATE_MERGE(files_outline);

  INDEX_UPDATE_MERGE(types_removed);
  INDEX_UPDATE_MERGE(types_added);
  INDEX_UPDATE_MERGE(types_def_changed);
  INDEX_UPDATE_MERGE(types_derived);
  INDEX_UPDATE_MERGE(types_uses);
  INDEX_UPDATE_MERGE(funcs_removed);
  INDEX_UPDATE_MERGE(funcs_added);
  INDEX_UPDATE_MERGE(funcs_def_changed);
  INDEX_UPDATE_MERGE(funcs_declarations);
  INDEX_UPDATE_MERGE(funcs_derived);
  INDEX_UPDATE_MERGE(funcs_callers);
  INDEX_UPDATE_MERGE(funcs_uses);

  INDEX_UPDATE_MERGE(vars_removed);
  INDEX_UPDATE_MERGE(vars_added);
  INDEX_UPDATE_MERGE(vars_def_changed);
  INDEX_UPDATE_MERGE(vars_uses);

#undef INDEX_UPDATE_MERGE
}

void QueryableDatabase::RemoveUsrs(const std::vector<Usr>& to_remove) {
  // TODO: Removing usrs is tricky because it means we will have to rebuild idx
  // locations. I'm thinking we just nullify the entry instead of actually
  // removing the data. The index could be massive.
  for (Usr usr : to_remove)
    usr_to_symbol[usr].kind = SymbolKind::Invalid;
}

void QueryableDatabase::Import(const std::vector<QueryableFile>& defs) {
  for (auto& def : defs) {
    usr_to_symbol[def.file_id] = SymbolIdx(SymbolKind::File, files.size());
    files.push_back(def);
  }
}

void QueryableDatabase::Import(const std::vector<QueryableTypeDef>& defs) {
  for (auto& def : defs) {
    usr_to_symbol[def.def.usr] = SymbolIdx(SymbolKind::Type, types.size());
    types.push_back(def);
  }
}

void QueryableDatabase::Import(const std::vector<QueryableFuncDef>& defs) {
  for (auto& def : defs) {
    usr_to_symbol[def.def.usr] = SymbolIdx(SymbolKind::Func, funcs.size());
    funcs.push_back(def);
  }
}

void QueryableDatabase::Import(const std::vector<QueryableVarDef>& defs) {
  for (auto& def : defs) {
    usr_to_symbol[def.def.usr] = SymbolIdx(SymbolKind::Var, vars.size());
    vars.push_back(def);
  }
}

void QueryableDatabase::Update(const std::vector<QueryableTypeDef::DefUpdate>& updates) {
  for (auto& def : updates) {
    SymbolIdx idx = usr_to_symbol[def.usr];
    assert(idx.kind == SymbolKind::Type);
    types[idx.idx].def = def;
  }
}

void QueryableDatabase::Update(const std::vector<QueryableFuncDef::DefUpdate>& updates) {
  for (auto& def : updates) {
    SymbolIdx idx = usr_to_symbol[def.usr];
    assert(idx.kind == SymbolKind::Func);
    funcs[idx.idx].def = def;
  }
}

void QueryableDatabase::Update(const std::vector<QueryableVarDef::DefUpdate>& updates) {
  for (auto& def : updates) {
    SymbolIdx idx = usr_to_symbol[def.usr];
    assert(idx.kind == SymbolKind::Var);
    vars[idx.idx].def = def;
  }
}

template <typename TId, typename TDef>
void AddAll(std::unordered_map<TId, int>* id_map, std::vector<TDef>* defs,
            const std::vector<TDef>& to_add) {
  for (const TDef& def : to_add) {
    (*id_map)[def.def.id] = defs->size();
    defs->push_back(def);
  }
}

template <typename TId, typename TDef>
void ApplyUpdates(std::unordered_map<TId, int>* id_map, std::vector<TDef>* defs,
                  const std::vector<typename TDef::DefUpdate>& updates) {
  for (const typename TDef::DefUpdate& def : updates) {
    TId id = def.id;
    int index = (*id_map)[id];
    (*defs)[index].def = def;
  }
}

void QueryableDatabase::ApplyIndexUpdate(IndexUpdate* update) {
#define JOIN(a, b) a##b
#define HANDLE_MERGEABLE(update_var_name, def_var_name, storage_name) \
  for (auto merge_update : JOIN(update->, update_var_name)) { \
    SymbolIdx index = usr_to_symbol[merge_update.usr]; \
    auto* def = &JOIN(storage_name, [index.idx]); \
    AddRange(JOIN(&def->, def_var_name), merge_update.to_add); \
    RemoveRange(JOIN(&def->, def_var_name), merge_update.to_remove); \
  }

  RemoveUsrs(update->files_removed);
  Import(update->files_added);
  HANDLE_MERGEABLE(files_outline, outline, files);

  RemoveUsrs(update->types_removed);
  Import(update->types_added);
  Update(update->types_def_changed);
  HANDLE_MERGEABLE(types_derived, derived, types);
  HANDLE_MERGEABLE(types_uses, uses, types);

  RemoveUsrs(update->funcs_removed);
  Import(update->funcs_added);
  Update(update->funcs_def_changed);
  HANDLE_MERGEABLE(funcs_declarations, declarations, funcs);
  HANDLE_MERGEABLE(funcs_derived, derived, funcs);
  HANDLE_MERGEABLE(funcs_callers, callers, funcs);
  HANDLE_MERGEABLE(funcs_uses, uses, funcs);
  RemoveUsrs(update->vars_removed);
  Import(update->vars_added);
  Update(update->vars_def_changed);
  HANDLE_MERGEABLE(vars_uses, uses, vars);

#undef HANDLE_MERGEABLE
#undef JOIN
}

int main233(int argc, char** argv) {
  // TODO: Unify UserToIdResolver and FileDb
  IdCache id_cache(1);

  IndexedFile indexed_file_a = Parse(&id_cache, "full_tests/index_delta/a_v0.cc", {});
  std::cout << indexed_file_a.ToString() << std::endl;

  std::cout << std::endl;

  IndexedFile indexed_file_b = Parse(&id_cache, "full_tests/index_delta/a_v1.cc", {});
  std::cout << indexed_file_b.ToString() << std::endl;

  // TODO: We don't need to do ID remapping when computing a diff. Well, we
  //       need to do it for the IndexUpdate.
  IndexUpdate import(indexed_file_a);
  /*
  dest_ids.Import(indexed_file_b.file_db, indexed_file_b.id_cache);
  IndexUpdate update = ComputeDiff(indexed_file_a, indexed_file_b);
  */

  QueryableDatabase db;
  db.ApplyIndexUpdate(&import);
  //db.ApplyIndexUpdate(&update);

  return 0;
}

// TODO: Idea: when indexing and joining to the main db, allow many dbs that
//       are joined to. That way, even if the main db is busy we can still be
//       joining. Joining the partially joined db to the main db should be
//       faster since we will have larger data lanes to use.

// TODO: I think we can run libclang multiple times in one process, so we might
//       only need two processes. Still, for perf reasons it would be good if
//       we could stay in one process. We could probably just use shared
//       memory. We may want to run libclang in a separate process to protect
//       from crashes/issues there.

// TODO: Allow the user to store configuration as json? File in home dir; also
//       allow local overrides (scan up dirs).
// TODO: Add an option to dump the config when starting (--dump-config).
// TODO: Allow the user to decide some indexer choices, ie, do we mark
//       prototype parameters as usages?