From 5fbe53b562d9af092e4b35b2b692d3fc0882d845 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 29 Jul 2019 04:08:52 +0300 Subject: [PATCH 01/21] Speedup "symbolizeAddress" function --- dbms/src/Common/SymbolIndex.cpp | 196 ++++++++++++++++++++++++ dbms/src/Common/SymbolIndex.h | 39 +++++ dbms/src/Common/tests/CMakeLists.txt | 3 + dbms/src/Common/tests/symbol_index.cpp | 39 +++++ dbms/src/Functions/symbolizeAddress.cpp | 38 +---- 5 files changed, 284 insertions(+), 31 deletions(-) create mode 100644 dbms/src/Common/SymbolIndex.cpp create mode 100644 dbms/src/Common/SymbolIndex.h create mode 100644 dbms/src/Common/tests/symbol_index.cpp diff --git a/dbms/src/Common/SymbolIndex.cpp b/dbms/src/Common/SymbolIndex.cpp new file mode 100644 index 0000000000..ea199a29ac --- /dev/null +++ b/dbms/src/Common/SymbolIndex.cpp @@ -0,0 +1,196 @@ +#include +#include + +#include +#include + +#include + +#include + + +namespace +{ + +/// Based on the code of musl-libc and the answer of Kanalpiroge on +/// https://stackoverflow.com/questions/15779185/list-all-the-functions-symbols-on-the-fly-in-c-code-on-a-linux-architecture + +/* Callback for dl_iterate_phdr. + * Is called by dl_iterate_phdr for every loaded shared lib until something + * else than 0 is returned by one call of this function. + */ +int collectSymbols(dl_phdr_info * info, size_t, void * out_symbols) +{ + /* ElfW is a macro that creates proper typenames for the used system architecture + * (e.g. 
on a 32 bit system, ElfW(Dyn*) becomes "Elf32_Dyn*") + */ + + std::vector & symbols = *reinterpret_cast *>(out_symbols); + + /* Iterate over all headers of the current shared lib + * (first call is for the executable itself) */ + for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) + { + /* Further processing is only needed if the dynamic section is reached + */ + if (info->dlpi_phdr[header_index].p_type != PT_DYNAMIC) + continue; + +// std::cerr << info->dlpi_name << "\n"; + + /* Get a pointer to the first entry of the dynamic section. + * It's address is the shared lib's address + the virtual address + */ + const ElfW(Dyn *) dyn_begin = reinterpret_cast(info->dlpi_addr + info->dlpi_phdr[header_index].p_vaddr); + +// std::cerr << "dlpi_addr: " << info->dlpi_addr << "\n"; + + /// For unknown reason, addresses are sometimes relative sometimes absolute. + auto correct_address = [](ElfW(Addr) base, ElfW(Addr) ptr) + { + return ptr > base ? ptr : base + ptr; + }; + + /* Iterate over all entries of the dynamic section until the + * end of the symbol table is reached. This is indicated by + * an entry with d_tag == DT_NULL. + */ + +/* for (auto it = dyn_begin; it->d_tag != DT_NULL; ++it) + std::cerr << it->d_tag << "\n";*/ + + size_t sym_cnt = 0; + for (auto it = dyn_begin; it->d_tag != DT_NULL; ++it) + { + if (it->d_tag == DT_HASH) + { + const ElfW(Word *) hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + +// std::cerr << it->d_un.d_ptr << ", " << it->d_un.d_val << "\n"; + + sym_cnt = hash[1]; + break; + } + else if (it->d_tag == DT_GNU_HASH) + { +// std::cerr << it->d_un.d_ptr << ", " << it->d_un.d_val << "\n"; + + /// This code based on Musl-libc. 
+ + const uint32_t * buckets = nullptr; + const uint32_t * hashval = nullptr; + + const ElfW(Word *) hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + + buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); + + for (ElfW(Word) i = 0; i < hash[0]; ++i) + if (buckets[i] > sym_cnt) + sym_cnt = buckets[i]; + + if (sym_cnt) + { + sym_cnt -= hash[1]; + hashval = buckets + hash[0] + sym_cnt; + do + { + ++sym_cnt; + } + while (!(*hashval++ & 1)); + } + + break; + } + } + +// std::cerr << "sym_cnt: " << sym_cnt << "\n"; + if (!sym_cnt) + continue; + + const char * strtab = nullptr; + for (auto it = dyn_begin; it->d_tag != DT_NULL; ++it) + { + if (it->d_tag == DT_STRTAB) + { + strtab = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + break; + } + } + + if (!strtab) + continue; + +// std::cerr << "Having strtab" << "\n"; + + for (auto it = dyn_begin; it->d_tag != DT_NULL; ++it) + { + if (it->d_tag == DT_SYMTAB) + { + /* Get the pointer to the first entry of the symbol table */ + const ElfW(Sym *) elf_sym = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + + /* Iterate over the symbol table */ + for (ElfW(Word) sym_index = 0; sym_index < sym_cnt; ++sym_index) + { + /* Get the name of the sym_index-th symbol. + * This is located at the address of st_name relative to the beginning of the string table. 
+ */ + const char * sym_name = &strtab[elf_sym[sym_index].st_name]; + + if (!sym_name) + continue; + +// std::cerr << sym_name << "\n"; + + DB::SymbolIndex::Symbol symbol; + symbol.address_begin = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value); + symbol.address_end = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value + elf_sym[sym_index].st_size); + int unused = 0; + symbol.name = demangle(sym_name, unused); + symbol.object = info->dlpi_name; + + symbols.push_back(std::move(symbol)); + } + + break; + } + } + } + + /* Continue iterations */ + return 0; +} + +} + + +namespace DB +{ + +void SymbolIndex::update() +{ + dl_iterate_phdr(collectSymbols, &symbols); + std::sort(symbols.begin(), symbols.end()); +} + +const SymbolIndex::Symbol * SymbolIndex::find(const void * address) const +{ + /// First range that has left boundary greater than address. + +// std::cerr << "Searching " << address << "\n"; + + auto it = std::lower_bound(symbols.begin(), symbols.end(), address); + if (it == symbols.begin()) + return nullptr; + else + --it; /// Last range that has left boundary less or equals than address. + +// std::cerr << "Range: " << it->address_begin << " ... " << it->address_end << "\n"; + + if (address >= it->address_begin && address < it->address_end) + return &*it; + else + return nullptr; +} + +} diff --git a/dbms/src/Common/SymbolIndex.h b/dbms/src/Common/SymbolIndex.h new file mode 100644 index 0000000000..b6b8284d2a --- /dev/null +++ b/dbms/src/Common/SymbolIndex.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +/** Allow to quickly find symbol name from address. + * Used as a replacement for "dladdr" function which is extremely slow. 
+ */ +class SymbolIndex +{ +public: + struct Symbol + { + const void * address_begin; + const void * address_end; + const char * object; + std::string name; /// demangled + + bool operator< (const Symbol & rhs) const { return address_begin < rhs.address_begin; } + bool operator< (const void * addr) const { return address_begin <= addr; } + }; + + SymbolIndex() { update(); } + void update(); + + const Symbol * find(const void * address) const; + + auto begin() const { return symbols.cbegin(); } + auto end() const { return symbols.cend(); } + +private: + std::vector symbols; +}; + +} diff --git a/dbms/src/Common/tests/CMakeLists.txt b/dbms/src/Common/tests/CMakeLists.txt index 83d69b2c8f..d11a46a38d 100644 --- a/dbms/src/Common/tests/CMakeLists.txt +++ b/dbms/src/Common/tests/CMakeLists.txt @@ -78,3 +78,6 @@ target_link_libraries (stopwatch PRIVATE clickhouse_common_io) add_executable (mi_malloc_test mi_malloc_test.cpp) target_link_libraries (mi_malloc_test PRIVATE clickhouse_common_io) + +add_executable (symbol_index symbol_index.cpp) +target_link_libraries (symbol_index PRIVATE clickhouse_common_io) diff --git a/dbms/src/Common/tests/symbol_index.cpp b/dbms/src/Common/tests/symbol_index.cpp new file mode 100644 index 0000000000..a9fec7069e --- /dev/null +++ b/dbms/src/Common/tests/symbol_index.cpp @@ -0,0 +1,39 @@ +#include +#include +#include +#include + + +void f() {} + +using namespace DB; + +int main(int argc, char ** argv) +{ + if (argc < 2) + { + std::cerr << "Usage: ./symbol_index address\n"; + return 1; + } + + SymbolIndex symbol_index; + + for (const auto & symbol : symbol_index) + std::cout << symbol.name << ": " << symbol.address_begin << " ... " << symbol.address_end << "\n"; + + const void * address = reinterpret_cast(std::stoull(argv[1], nullptr, 16)); + + auto symbol = symbol_index.find(address); + if (symbol) + std::cerr << symbol->name << ": " << symbol->address_begin << " ... 
" << symbol->address_end << "\n"; + else + std::cerr << "Not found\n"; + + Dl_info info; + if (dladdr(address, &info) && info.dli_sname) + std::cerr << demangle(info.dli_sname) << ": " << info.dli_saddr << "\n"; + else + std::cerr << "Not found\n"; + + return 0; +} diff --git a/dbms/src/Functions/symbolizeAddress.cpp b/dbms/src/Functions/symbolizeAddress.cpp index 1096a8924b..65c1aa84d3 100644 --- a/dbms/src/Functions/symbolizeAddress.cpp +++ b/dbms/src/Functions/symbolizeAddress.cpp @@ -1,9 +1,4 @@ -#include -#include -#include -#include -#include -#include +#include #include #include #include @@ -63,25 +58,10 @@ public: return true; } - static std::string addressToSymbol(UInt64 uint_address) - { - void * addr = unalignedLoad(&uint_address); - - /// This is extremely slow. - Dl_info info; - if (dladdr(addr, &info) && info.dli_sname) - { - int demangling_status = 0; - return demangle(info.dli_sname, demangling_status); - } - else - { - return {}; - } - } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override { + static SymbolIndex symbol_index; + const ColumnPtr & column = block.getByPosition(arguments[0]).column; const ColumnUInt64 * column_concrete = checkAndGetColumn(column.get()); @@ -91,19 +71,15 @@ public: const typename ColumnVector::Container & data = column_concrete->getData(); auto result_column = ColumnString::create(); - static SimpleCache func_cached; - for (size_t i = 0; i < input_rows_count; ++i) { - std::string symbol = func_cached(data[i]); - result_column->insertDataWithTerminatingZero(symbol.data(), symbol.size() + 1); + if (const auto * symbol = symbol_index.find(reinterpret_cast(data[i]))) + result_column->insertDataWithTerminatingZero(symbol->name.data(), symbol->name.size() + 1); + else + result_column->insertDefault(); } block.getByPosition(result).column = std::move(result_column); - - /// Do not let our cache to grow indefinitely (simply drop it) - if (func_cached.size() > 
1000000) - func_cached.drop(); } }; From 300d25256e9bfd8b9958bc3bf95205897187ac30 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 29 Jul 2019 15:48:19 +0300 Subject: [PATCH 02/21] Include private symbols in stack traces of QueryProfiler --- dbms/src/Common/SymbolIndex.cpp | 228 +++++++++++++++++++++++++++++--- 1 file changed, 209 insertions(+), 19 deletions(-) diff --git a/dbms/src/Common/SymbolIndex.cpp b/dbms/src/Common/SymbolIndex.cpp index ea199a29ac..f8893bd4ac 100644 --- a/dbms/src/Common/SymbolIndex.cpp +++ b/dbms/src/Common/SymbolIndex.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -6,27 +7,25 @@ #include -#include +//#include +#include namespace { +/// Notes: "PHDR" is "Program Headers". +/// To look at program headers, you can run: objdump -p ./clickhouse-server +/// Also look at: https://wiki.osdev.org/ELF +/// Also look at: man elf +/// http://www.linker-aliens.org/blogs/ali/entry/inside_elf_symbol_tables/ +/// https://stackoverflow.com/questions/32088140/multiple-string-tables-in-elf-object + + /// Based on the code of musl-libc and the answer of Kanalpiroge on /// https://stackoverflow.com/questions/15779185/list-all-the-functions-symbols-on-the-fly-in-c-code-on-a-linux-architecture - -/* Callback for dl_iterate_phdr. - * Is called by dl_iterate_phdr for every loaded shared lib until something - * else than 0 is returned by one call of this function. - */ -int collectSymbols(dl_phdr_info * info, size_t, void * out_symbols) +void collectSymbolsFromProgramHeaders(dl_phdr_info * info, std::vector & symbols) { - /* ElfW is a macro that creates proper typenames for the used system architecture - * (e.g. 
on a 32 bit system, ElfW(Dyn*) becomes "Elf32_Dyn*") - */ - - std::vector & symbols = *reinterpret_cast *>(out_symbols); - /* Iterate over all headers of the current shared lib * (first call is for the executable itself) */ for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) @@ -36,12 +35,10 @@ int collectSymbols(dl_phdr_info * info, size_t, void * out_symbols) if (info->dlpi_phdr[header_index].p_type != PT_DYNAMIC) continue; -// std::cerr << info->dlpi_name << "\n"; - /* Get a pointer to the first entry of the dynamic section. * It's address is the shared lib's address + the virtual address */ - const ElfW(Dyn *) dyn_begin = reinterpret_cast(info->dlpi_addr + info->dlpi_phdr[header_index].p_vaddr); + const ElfW(Dyn) * dyn_begin = reinterpret_cast(info->dlpi_addr + info->dlpi_phdr[header_index].p_vaddr); // std::cerr << "dlpi_addr: " << info->dlpi_addr << "\n"; @@ -64,7 +61,7 @@ int collectSymbols(dl_phdr_info * info, size_t, void * out_symbols) { if (it->d_tag == DT_HASH) { - const ElfW(Word *) hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); // std::cerr << it->d_un.d_ptr << ", " << it->d_un.d_val << "\n"; @@ -80,7 +77,7 @@ int collectSymbols(dl_phdr_info * info, size_t, void * out_symbols) const uint32_t * buckets = nullptr; const uint32_t * hashval = nullptr; - const ElfW(Word *) hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); @@ -127,11 +124,15 @@ int collectSymbols(dl_phdr_info * info, size_t, void * out_symbols) if (it->d_tag == DT_SYMTAB) { /* Get the pointer to the first entry of the symbol table */ - const ElfW(Sym *) elf_sym = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Sym) * elf_sym = 
reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); /* Iterate over the symbol table */ for (ElfW(Word) sym_index = 0; sym_index < sym_cnt; ++sym_index) { + /// We are not interested in empty symbols. + if (!elf_sym[sym_index].st_size) + continue; + /* Get the name of the sym_index-th symbol. * This is located at the address of st_name relative to the beginning of the string table. */ @@ -156,6 +157,195 @@ int collectSymbols(dl_phdr_info * info, size_t, void * out_symbols) } } } +} + + +void collectSymbolsFromELFSymbolTable( + dl_phdr_info * info, + const char * mapped_elf, + size_t elf_size, + const ElfW(Shdr) * symbol_table, + const ElfW(Shdr) * string_table, + std::vector & symbols) +{ + if (symbol_table->sh_offset + symbol_table->sh_size > elf_size + || string_table->sh_offset + string_table->sh_size > elf_size) + return; + + /// Iterate symbol table. + const ElfW(Sym) * symbol_table_entry = reinterpret_cast(mapped_elf + symbol_table->sh_offset); + const ElfW(Sym) * symbol_table_end = reinterpret_cast(mapped_elf + symbol_table->sh_offset + symbol_table->sh_size); + +// std::cerr << "Symbol table has: " << (symbol_table_end - symbol_table_entry) << "\n"; + + const char * strings = reinterpret_cast(mapped_elf + string_table->sh_offset); + + for (; symbol_table_entry < symbol_table_end; ++symbol_table_entry) + { + if (!symbol_table_entry->st_name + || !symbol_table_entry->st_value + || !symbol_table_entry->st_size + || string_table->sh_offset + symbol_table_entry->st_name >= elf_size) + continue; + +// std::cerr << "Symbol Ok" << "\n"; + + /// Find the name in strings table. 
+ const char * symbol_name = strings + symbol_table_entry->st_name; + +// std::cerr << "Symbol name: " << symbol_name << "\n"; + + DB::SymbolIndex::Symbol symbol; + symbol.address_begin = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value); + symbol.address_end = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value + symbol_table_entry->st_size); + int unused = 0; + symbol.name = demangle(symbol_name, unused); + symbol.object = info->dlpi_name; + + symbols.push_back(std::move(symbol)); + } +} + + +bool collectSymbolsFromELFSymbolTable( + dl_phdr_info * info, + const char * mapped_elf, + size_t elf_size, + const ElfW(Shdr) * section_headers, + size_t section_header_num_entries, + ElfW(Off) section_names_offset, + const char * section_names, + ElfW(Word) section_header_type, + const char * string_table_name, + std::vector & symbols) +{ + const ElfW(Shdr) * symbol_table = nullptr; + const ElfW(Shdr) * string_table = nullptr; + + for (size_t section_header_idx = 0; section_header_idx < section_header_num_entries; ++section_header_idx) + { + auto & entry = section_headers[section_header_idx]; + +// std::cerr << entry.sh_type << ", " << (section_names + entry.sh_name) << "\n"; + + if (section_names_offset + entry.sh_name >= elf_size) + return false; + + if (entry.sh_type == section_header_type) + symbol_table = &entry; + else if (entry.sh_type == SHT_STRTAB && 0 == strcmp(section_names + entry.sh_name, string_table_name)) + string_table = &entry; + + if (symbol_table && string_table) + break; + } + + if (!symbol_table || !string_table) + return false; + +// std::cerr << "Found tables for " << string_table_name << "\n"; + + collectSymbolsFromELFSymbolTable(info, mapped_elf, elf_size, symbol_table, string_table, symbols); + return true; +} + + +void collectSymbolsFromELF(dl_phdr_info * info, std::vector & symbols) +{ + std::string object_name = info->dlpi_name; + + /// If the name is empty - it's main executable. 
+ /// Find a elf file for the main executable. + + if (object_name.empty()) + object_name = "/proc/self/exe"; + + std::error_code ec; + object_name = std::filesystem::canonical(object_name, ec); + + if (ec) + return; + +// std::cerr << object_name << "\n"; + + /// Read elf file. + DB::MMapReadBufferFromFile in(object_name, 0); + + /// Check if it's an elf. + size_t elf_size = in.buffer().size(); + if (elf_size < sizeof(ElfW(Ehdr))) + return; + +// std::cerr << "Size Ok" << "\n"; + + const char * mapped_elf = in.buffer().begin(); + const ElfW(Ehdr) * elf_header = reinterpret_cast(mapped_elf); + + if (memcmp(elf_header->e_ident, "\x7F""ELF", 4) != 0) + return; + +// std::cerr << "Header Ok" << "\n"; + + /// Get section header. + ElfW(Off) section_header_offset = elf_header->e_shoff; + uint16_t section_header_num_entries = elf_header->e_shnum; + +// std::cerr << section_header_offset << ", " << section_header_num_entries << ", " << (section_header_num_entries * sizeof(ElfW(Shdr))) << ", " << elf_size << "\n"; + + if (!section_header_offset + || !section_header_num_entries + || section_header_offset + section_header_num_entries * sizeof(ElfW(Shdr)) > elf_size) + return; + +// std::cerr << "Section header Ok" << "\n"; + + /// Find symtab, strtab or dyndym, dynstr. + const ElfW(Shdr) * section_headers = reinterpret_cast(mapped_elf + section_header_offset); + + /// The string table with section names. 
+ ElfW(Off) section_names_offset = 0; + const char * section_names = nullptr; + for (size_t section_header_idx = 0; section_header_idx < section_header_num_entries; ++section_header_idx) + { + auto & entry = section_headers[section_header_idx]; + if (entry.sh_type == SHT_STRTAB && elf_header->e_shstrndx == section_header_idx) + { +// std::cerr << "Found section names\n"; + section_names_offset = entry.sh_offset; + if (section_names_offset >= elf_size) + return; + section_names = reinterpret_cast(mapped_elf + section_names_offset); + break; + } + } + + if (!section_names) + return; + + collectSymbolsFromELFSymbolTable( + info, mapped_elf, elf_size, section_headers, section_header_num_entries, + section_names_offset, section_names, SHT_SYMTAB, ".strtab", symbols); + + collectSymbolsFromELFSymbolTable( + info, mapped_elf, elf_size, section_headers, section_header_num_entries, + section_names_offset, section_names, SHT_DYNSYM, ".dynstr", symbols); +} + + +/* Callback for dl_iterate_phdr. + * Is called by dl_iterate_phdr for every loaded shared lib until something + * else than 0 is returned by one call of this function. + */ +int collectSymbols(dl_phdr_info * info, size_t, void * out_symbols) +{ + /* ElfW is a macro that creates proper typenames for the used system architecture + * (e.g. 
on a 32 bit system, ElfW(Dyn*) becomes "Elf32_Dyn*") + */ + + std::vector & symbols = *reinterpret_cast *>(out_symbols); + + collectSymbolsFromProgramHeaders(info, symbols); + collectSymbolsFromELF(info, symbols); /* Continue iterations */ return 0; From 957b59f0d0fea005206c8f6652936a32a7e27826 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 29 Jul 2019 20:14:53 +0300 Subject: [PATCH 03/21] Cleanups --- dbms/src/Common/Elf.cpp | 121 +++++++++++++++++++++++++++ dbms/src/Common/Elf.h | 57 +++++++++++++ dbms/src/Common/ErrorCodes.cpp | 1 + dbms/src/Common/SymbolIndex.cpp | 139 +++++++------------------------- dbms/src/Common/SymbolIndex.h | 2 +- 5 files changed, 208 insertions(+), 112 deletions(-) create mode 100644 dbms/src/Common/Elf.cpp create mode 100644 dbms/src/Common/Elf.h diff --git a/dbms/src/Common/Elf.cpp b/dbms/src/Common/Elf.cpp new file mode 100644 index 0000000000..85aed3367c --- /dev/null +++ b/dbms/src/Common/Elf.cpp @@ -0,0 +1,121 @@ +#include +#include + +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_ELF; +} + + +Elf::Elf(const std::string & path) + : in(path, 0) +{ + std::cerr << "Processing path " << path << "\n"; + + /// Check if it's an elf. + size = in.buffer().size(); + if (size < sizeof(ElfEhdr)) + throw Exception("The size of supposedly ELF file is too small", ErrorCodes::CANNOT_PARSE_ELF); + + mapped = in.buffer().begin(); + header = reinterpret_cast(mapped); + + if (memcmp(header->e_ident, "\x7F""ELF", 4) != 0) + throw Exception("The file is not ELF according to magic", ErrorCodes::CANNOT_PARSE_ELF); + + /// Get section header. 
+ ElfOff section_header_offset = header->e_shoff; + uint16_t section_header_num_entries = header->e_shnum; + + if (!section_header_offset + || !section_header_num_entries + || section_header_offset + section_header_num_entries * sizeof(ElfShdr) > size) + throw Exception("The ELF is truncated (section header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF); + + section_headers = reinterpret_cast(mapped + section_header_offset); + + /// The string table with section names. + auto section_names_strtab = findSection([&](const Section & section, size_t idx) + { + return section.header.sh_type == SHT_STRTAB && header->e_shstrndx == idx; + }); + + if (!section_names_strtab) + throw Exception("The ELF doesn't have string table with section names", ErrorCodes::CANNOT_PARSE_ELF); + + ElfOff section_names_offset = section_names_strtab->header.sh_offset; + if (section_names_offset >= size) + throw Exception("The ELF is truncated (section names string table points after end of file)", ErrorCodes::CANNOT_PARSE_ELF); + + section_names = reinterpret_cast(mapped + section_names_offset); +} + + +Elf::Section::Section(const ElfShdr & header, const Elf & elf) + : header(header), elf(elf) +{ +} + + +bool Elf::iterateSections(std::function && pred) const +{ + for (size_t idx = 0; idx < header->e_shnum; ++idx) + { + Section section(section_headers[idx], *this); + + /// Sections spans after end of file. 
+ if (section.header.sh_offset + section.header.sh_size > size) + continue; + + if (pred(section, idx)) + return true; + } + return false; +} + + +std::optional Elf::findSection(std::function && pred) const +{ + std::optional result; + + iterateSections([&](const Section & section, size_t idx) + { + if (pred(section, idx)) + { + result.emplace(section); + return true; + } + return false; + }); + + return result; +} + + +const char * Elf::Section::name() const +{ + if (!elf.section_names) + throw Exception("Section names are not initialized", ErrorCodes::CANNOT_PARSE_ELF); + + /// TODO buffer overflow is possible, we may need to check strlen. + return elf.section_names + header.sh_name; +} + + +const char * Elf::Section::begin() const +{ + return elf.mapped + header.sh_offset; +} + +const char * Elf::Section::end() const +{ + return elf.mapped + header.sh_offset + header.sh_size; +} + +} diff --git a/dbms/src/Common/Elf.h b/dbms/src/Common/Elf.h new file mode 100644 index 0000000000..807e265b11 --- /dev/null +++ b/dbms/src/Common/Elf.h @@ -0,0 +1,57 @@ +#pragma once + +#include + +#include +#include +#include + +#include +#include + + +using ElfAddr = ElfW(Addr); +using ElfEhdr = ElfW(Ehdr); +using ElfOff = ElfW(Off); +using ElfPhdr = ElfW(Phdr); +using ElfShdr = ElfW(Shdr); +using ElfSym = ElfW(Sym); + + +namespace DB +{ + +class Elf +{ +public: + struct Section + { + const ElfShdr & header; + const char * name() const; + + const char * begin() const; + const char * end() const; + + Section(const ElfShdr & header, const Elf & elf); + + private: + const Elf & elf; + }; + + Elf(const std::string & path); + + std::optional
findSection(std::function && pred) const; + bool iterateSections(std::function && pred) const; + + const char * end() const { return mapped + size; } + +private: + MMapReadBufferFromFile in; + size_t size; + const char * mapped; + const ElfEhdr * header; + const ElfShdr * section_headers; + const char * section_names = nullptr; +}; + +} diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index e8ee16c567..53f40239d1 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -438,6 +438,7 @@ namespace ErrorCodes extern const int CANNOT_SET_TIMER_PERIOD = 461; extern const int CANNOT_DELETE_TIMER = 462; extern const int CANNOT_FCNTL = 463; + extern const int CANNOT_PARSE_ELF = 464; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Common/SymbolIndex.cpp b/dbms/src/Common/SymbolIndex.cpp index f8893bd4ac..24fc93aec9 100644 --- a/dbms/src/Common/SymbolIndex.cpp +++ b/dbms/src/Common/SymbolIndex.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -162,39 +162,28 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, std::vector & symbols) { - if (symbol_table->sh_offset + symbol_table->sh_size > elf_size - || string_table->sh_offset + string_table->sh_size > elf_size) - return; - /// Iterate symbol table. 
- const ElfW(Sym) * symbol_table_entry = reinterpret_cast(mapped_elf + symbol_table->sh_offset); - const ElfW(Sym) * symbol_table_end = reinterpret_cast(mapped_elf + symbol_table->sh_offset + symbol_table->sh_size); + const ElfSym * symbol_table_entry = reinterpret_cast(symbol_table.begin()); + const ElfSym * symbol_table_end = reinterpret_cast(symbol_table.end()); -// std::cerr << "Symbol table has: " << (symbol_table_end - symbol_table_entry) << "\n"; - - const char * strings = reinterpret_cast(mapped_elf + string_table->sh_offset); + const char * strings = string_table.begin(); for (; symbol_table_entry < symbol_table_end; ++symbol_table_entry) { if (!symbol_table_entry->st_name || !symbol_table_entry->st_value || !symbol_table_entry->st_size - || string_table->sh_offset + symbol_table_entry->st_name >= elf_size) + || strings + symbol_table_entry->st_name >= elf.end()) continue; -// std::cerr << "Symbol Ok" << "\n"; - /// Find the name in strings table. const char * symbol_name = strings + symbol_table_entry->st_name; -// std::cerr << "Symbol name: " << symbol_name << "\n"; - DB::SymbolIndex::Symbol symbol; symbol.address_begin = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value); symbol.address_end = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value + symbol_table_entry->st_size); @@ -207,45 +196,32 @@ void collectSymbolsFromELFSymbolTable( } -bool collectSymbolsFromELFSymbolTable( +bool searchAndCollectSymbolsFromELFSymbolTable( dl_phdr_info * info, - const char * mapped_elf, - size_t elf_size, - const ElfW(Shdr) * section_headers, - size_t section_header_num_entries, - ElfW(Off) section_names_offset, - const char * section_names, - ElfW(Word) section_header_type, + const DB::Elf & elf, + unsigned section_header_type, const char * string_table_name, std::vector & symbols) { - const ElfW(Shdr) * symbol_table = nullptr; - const ElfW(Shdr) * string_table = nullptr; + std::optional symbol_table; + std::optional string_table; - for 
(size_t section_header_idx = 0; section_header_idx < section_header_num_entries; ++section_header_idx) - { - auto & entry = section_headers[section_header_idx]; + if (!elf.iterateSections([&](const DB::Elf::Section & section, size_t) + { + if (section.header.sh_type == section_header_type) + symbol_table.emplace(section); + else if (section.header.sh_type == SHT_STRTAB && 0 == strcmp(section.name(), string_table_name)) + string_table.emplace(section); -// std::cerr << entry.sh_type << ", " << (section_names + entry.sh_name) << "\n"; - - if (section_names_offset + entry.sh_name >= elf_size) + if (symbol_table && string_table) + return true; return false; - - if (entry.sh_type == section_header_type) - symbol_table = &entry; - else if (entry.sh_type == SHT_STRTAB && 0 == strcmp(section_names + entry.sh_name, string_table_name)) - string_table = &entry; - - if (symbol_table && string_table) - break; + })) + { + return false; } - if (!symbol_table || !string_table) - return false; - -// std::cerr << "Found tables for " << string_table_name << "\n"; - - collectSymbolsFromELFSymbolTable(info, mapped_elf, elf_size, symbol_table, string_table, symbols); + collectSymbolsFromELFSymbolTable(info, elf, *symbol_table, *string_table, symbols); return true; } @@ -266,69 +242,10 @@ void collectSymbolsFromELF(dl_phdr_info * info, std::vector(mapped_elf); - - if (memcmp(elf_header->e_ident, "\x7F""ELF", 4) != 0) - return; - -// std::cerr << "Header Ok" << "\n"; - - /// Get section header. 
- ElfW(Off) section_header_offset = elf_header->e_shoff; - uint16_t section_header_num_entries = elf_header->e_shnum; - -// std::cerr << section_header_offset << ", " << section_header_num_entries << ", " << (section_header_num_entries * sizeof(ElfW(Shdr))) << ", " << elf_size << "\n"; - - if (!section_header_offset - || !section_header_num_entries - || section_header_offset + section_header_num_entries * sizeof(ElfW(Shdr)) > elf_size) - return; - -// std::cerr << "Section header Ok" << "\n"; - - /// Find symtab, strtab or dyndym, dynstr. - const ElfW(Shdr) * section_headers = reinterpret_cast(mapped_elf + section_header_offset); - - /// The string table with section names. - ElfW(Off) section_names_offset = 0; - const char * section_names = nullptr; - for (size_t section_header_idx = 0; section_header_idx < section_header_num_entries; ++section_header_idx) - { - auto & entry = section_headers[section_header_idx]; - if (entry.sh_type == SHT_STRTAB && elf_header->e_shstrndx == section_header_idx) - { -// std::cerr << "Found section names\n"; - section_names_offset = entry.sh_offset; - if (section_names_offset >= elf_size) - return; - section_names = reinterpret_cast(mapped_elf + section_names_offset); - break; - } - } - - if (!section_names) - return; - - collectSymbolsFromELFSymbolTable( - info, mapped_elf, elf_size, section_headers, section_header_num_entries, - section_names_offset, section_names, SHT_SYMTAB, ".strtab", symbols); - - collectSymbolsFromELFSymbolTable( - info, mapped_elf, elf_size, section_headers, section_header_num_entries, - section_names_offset, section_names, SHT_DYNSYM, ".dynstr", symbols); + searchAndCollectSymbolsFromELFSymbolTable(info, elf, SHT_SYMTAB, ".strtab", symbols); + searchAndCollectSymbolsFromELFSymbolTable(info, elf, SHT_DYNSYM, ".dynstr", symbols); } diff --git a/dbms/src/Common/SymbolIndex.h b/dbms/src/Common/SymbolIndex.h index b6b8284d2a..41c7a10648 100644 --- a/dbms/src/Common/SymbolIndex.h +++ 
b/dbms/src/Common/SymbolIndex.h @@ -18,7 +18,7 @@ public: const void * address_begin; const void * address_end; const char * object; - std::string name; /// demangled + std::string name; /// demangled NOTE Can use Arena for strings bool operator< (const Symbol & rhs) const { return address_begin < rhs.address_begin; } bool operator< (const void * addr) const { return address_begin <= addr; } From daa36650fb9e4f43989919efdda65319ffb36ce5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 29 Jul 2019 21:06:39 +0300 Subject: [PATCH 04/21] Imported Dwarf parser from Facebook folly --- dbms/src/Common/Dwarf.cpp | 1097 ++++++++++++++++++++++++++++++++ dbms/src/Common/Dwarf.h | 287 +++++++++ dbms/src/Common/Elf.cpp | 17 +- dbms/src/Common/Elf.h | 8 +- dbms/src/Common/ErrorCodes.cpp | 1 + 5 files changed, 1404 insertions(+), 6 deletions(-) create mode 100644 dbms/src/Common/Dwarf.cpp create mode 100644 dbms/src/Common/Dwarf.h diff --git a/dbms/src/Common/Dwarf.cpp b/dbms/src/Common/Dwarf.cpp new file mode 100644 index 0000000000..45a5116642 --- /dev/null +++ b/dbms/src/Common/Dwarf.cpp @@ -0,0 +1,1097 @@ +/* + * Copyright 2012-present Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** This file was edited for ClickHouse. 
+ */ + +#include + +#include + +#include +#include +#include + + +#define DW_CHILDREN_no 0 +#define DW_FORM_addr 1 +#define DW_FORM_block1 0x0a +#define DW_FORM_block2 3 +#define DW_FORM_block4 4 +#define DW_FORM_block 9 +#define DW_FORM_exprloc 0x18 +#define DW_FORM_data1 0x0b +#define DW_FORM_ref1 0x11 +#define DW_FORM_data2 0x05 +#define DW_FORM_ref2 0x12 +#define DW_FORM_data4 0x06 +#define DW_FORM_ref4 0x13 +#define DW_FORM_data8 0x07 +#define DW_FORM_ref8 0x14 +#define DW_FORM_sdata 0x0d +#define DW_FORM_udata 0x0f +#define DW_FORM_ref_udata 0x15 +#define DW_FORM_flag 0x0c +#define DW_FORM_flag_present 0x19 +#define DW_FORM_sec_offset 0x17 +#define DW_FORM_ref_addr 0x10 +#define DW_FORM_string 0x08 +#define DW_FORM_strp 0x0e +#define DW_FORM_indirect 0x16 +#define DW_TAG_compile_unit 0x11 +#define DW_AT_stmt_list 0x10 +#define DW_AT_comp_dir 0x1b +#define DW_AT_name 0x03 +#define DW_LNE_define_file 0x03 +#define DW_LNS_copy 0x01 +#define DW_LNS_advance_pc 0x02 +#define DW_LNS_advance_line 0x03 +#define DW_LNS_set_file 0x04 +#define DW_LNS_set_column 0x05 +#define DW_LNS_negate_stmt 0x06 +#define DW_LNS_set_basic_block 0x07 +#define DW_LNS_const_add_pc 0x08 +#define DW_LNS_fixed_advance_pc 0x09 +#define DW_LNS_set_prologue_end 0x0a +#define DW_LNS_set_epilogue_begin 0x0b +#define DW_LNS_set_isa 0x0c +#define DW_LNE_end_sequence 0x01 +#define DW_LNE_set_address 0x02 +#define DW_LNE_set_discriminator 0x04 + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_DWARF; +} + + +Dwarf::Dwarf(const Elf & elf) : elf_(&elf) +{ + init(); +} + +Dwarf::Section::Section(std::string_view d) : is64Bit_(false), data_(d) +{ +} + + +#define SAFE_CHECK(cond, message) do { if (!(cond)) throw Exception(message, ErrorCodes::CANNOT_PARSE_DWARF); } while (false) + + +namespace +{ +// All following read* functions read from a std::string_view, advancing the +// std::string_view, and aborting if there's not enough room. 
+ +// Read (bitwise) one object of type T +template +std::enable_if_t, T> read(std::string_view & sp) +{ + SAFE_CHECK(sp.size() >= sizeof(T), "underflow"); + T x; + memcpy(&x, sp.data(), sizeof(T)); + sp.remove_prefix(sizeof(T)); + return x; +} + +// Read ULEB (unsigned) varint value; algorithm from the DWARF spec +uint64_t readULEB(std::string_view & sp, uint8_t & shift, uint8_t & val) +{ + uint64_t r = 0; + shift = 0; + do + { + val = read(sp); + r |= (uint64_t(val & 0x7f) << shift); + shift += 7; + } while (val & 0x80); + return r; +} + +uint64_t readULEB(std::string_view & sp) +{ + uint8_t shift; + uint8_t val; + return readULEB(sp, shift, val); +} + +// Read SLEB (signed) varint value; algorithm from the DWARF spec +int64_t readSLEB(std::string_view & sp) +{ + uint8_t shift; + uint8_t val; + uint64_t r = readULEB(sp, shift, val); + + if (shift < 64 && (val & 0x40)) + { + r |= -(1ULL << shift); // sign extend + } + + return r; +} + +// Read a value of "section offset" type, which may be 4 or 8 bytes +uint64_t readOffset(std::string_view & sp, bool is64Bit) +{ + return is64Bit ? 
read(sp) : read(sp); +} + +// Read "len" bytes; throws if fewer than "len" bytes remain in sp +std::string_view readBytes(std::string_view & sp, uint64_t len) +{ + SAFE_CHECK(len <= sp.size(), "invalid string length"); + std::string_view ret(sp.data(), len); + sp.remove_prefix(len); + return ret; +} + +// Read a null-terminated string and advance sp past its terminating null byte +std::string_view readNullTerminated(std::string_view & sp) +{ + const char * p = static_cast(memchr(sp.data(), 0, sp.size())); + SAFE_CHECK(p, "invalid null-terminated string"); + std::string_view ret(sp.data(), p - sp.data()); + sp.remove_prefix(ret.size() + 1); + return ret; +} + +// Skip over padding until sp.data() - start is a multiple of alignment +void skipPadding(std::string_view & sp, const char * start, size_t alignment) +{ + size_t remainder = (sp.data() - start) % alignment; + if (remainder) + { + SAFE_CHECK(alignment - remainder <= sp.size(), "invalid padding"); + sp.remove_prefix(alignment - remainder); + } +} + +// Simplify a path -- as much as we can while not moving data around... +/*void simplifyPath(std::string_view & sp) +{ + // Strip leading slashes and useless patterns (./), leaving one initial + // slash. + for (;;) + { + if (sp.empty()) + { + return; + } + + // Strip leading slashes, leaving one. + while (sp.startsWith("//")) + { + sp.remove_prefix(1); + } + + if (sp.startsWith("/./")) + { + // Note 2, not 3, to keep it absolute + sp.remove_prefix(2); + continue; + } + + if (sp.removePrefix("./")) + { + // Also remove any subsequent slashes to avoid making this path absolute. + while (sp.startsWith('/')) + { + sp.remove_prefix(1); + } + continue; + } + + break; + } + + // Strip trailing slashes and useless patterns (/.). + for (;;) + { + if (sp.empty()) + { + return; + } + + // Strip trailing slashes, except when this is the root path.
+ while (sp.size() > 1 && sp.removeSuffix('/')) + { + } + + if (sp.removeSuffix("/.")) + { + continue; + } + + break; + } +}*/ + +} + + +Dwarf::Path::Path(std::string_view baseDir, std::string_view subDir, std::string_view file) + : baseDir_(baseDir), subDir_(subDir), file_(file) +{ + using std::swap; + + // Normalize + if (file_.empty()) + { + baseDir_ = {}; + subDir_ = {}; + return; + } + + if (file_[0] == '/') + { + // file_ is absolute + baseDir_ = {}; + subDir_ = {}; + } + + if (!subDir_.empty() && subDir_[0] == '/') + { + baseDir_ = {}; // subDir_ is absolute + } + +// simplifyPath(baseDir_); +// simplifyPath(subDir_); +// simplifyPath(file_); + + // Make sure it's never the case that baseDir_ is empty, but subDir_ isn't. + if (baseDir_.empty()) + { + swap(baseDir_, subDir_); + } +} + +size_t Dwarf::Path::size() const +{ + size_t size = 0; + bool needsSlash = false; + + if (!baseDir_.empty()) + { + size += baseDir_.size(); + needsSlash = baseDir_.back() != '/'; + } + + if (!subDir_.empty()) + { + size += needsSlash; + size += subDir_.size(); + needsSlash = subDir_.back() != '/'; + } + + if (!file_.empty()) + { + size += needsSlash; + size += file_.size(); + } + + return size; +} + +size_t Dwarf::Path::toBuffer(char * buf, size_t bufSize) const +{ + size_t totalSize = 0; + bool needsSlash = false; + + auto append = [&](std::string_view sp) + { + if (bufSize >= 2) + { + size_t toCopy = std::min(sp.size(), bufSize - 1); + memcpy(buf, sp.data(), toCopy); + buf += toCopy; + bufSize -= toCopy; + } + totalSize += sp.size(); + }; + + if (!baseDir_.empty()) + { + append(baseDir_); + needsSlash = baseDir_.back() != '/'; + } + if (!subDir_.empty()) + { + if (needsSlash) + { + append("/"); + } + append(subDir_); + needsSlash = subDir_.back() != '/'; + } + if (!file_.empty()) + { + if (needsSlash) + { + append("/"); + } + append(file_); + } + if (bufSize) + { + *buf = '\0'; + } + + SAFE_CHECK(totalSize == size(), "Size mismatch"); + return totalSize; +} + +void 
Dwarf::Path::toString(std::string & dest) const +{ + size_t initialSize = dest.size(); + dest.reserve(initialSize + size()); + if (!baseDir_.empty()) + { + dest.append(baseDir_.begin(), baseDir_.end()); + } + if (!subDir_.empty()) + { + if (!dest.empty() && dest.back() != '/') + { + dest.push_back('/'); + } + dest.append(subDir_.begin(), subDir_.end()); + } + if (!file_.empty()) + { + if (!dest.empty() && dest.back() != '/') + { + dest.push_back('/'); + } + dest.append(file_.begin(), file_.end()); + } + SAFE_CHECK(dest.size() == initialSize + size(), "Size mismatch"); +} + +// Next chunk in section +bool Dwarf::Section::next(std::string_view & chunk) +{ + chunk = data_; + if (chunk.empty()) + return false; + + // Initial length is a uint32_t value for a 32-bit section, and + // a 96-bit value (0xffffffff followed by the 64-bit length) for a 64-bit + // section. + auto initialLength = read(chunk); + is64Bit_ = (initialLength == uint32_t(-1)); + auto length = is64Bit_ ? read(chunk) : initialLength; + SAFE_CHECK(length <= chunk.size(), "invalid DWARF section"); + chunk = std::string_view(chunk.data(), length); + data_ = std::string_view(chunk.end(), data_.end() - chunk.end()); + return true; +} + +bool Dwarf::getSection(const char * name, std::string_view * section) const +{ + std::optional elf_section = elf_->findSectionByName(name); + if (!elf_section) + return false; + +#ifdef SHF_COMPRESSED + if (elf_section->header.sh_flags & SHF_COMPRESSED) + return false; +#endif + + *section = { elf_section->begin(), elf_section->size()}; + return true; +} + +void Dwarf::init() +{ + // Make sure that all .debug_* sections exist + if (!getSection(".debug_info", &info_) + || !getSection(".debug_abbrev", &abbrev_) + || !getSection(".debug_line", &line_) + || !getSection(".debug_str", &strings_)) + { + elf_ = nullptr; + return; + } + + // Optional: fast address range lookup. If missing .debug_info can + // be used - but it's much slower (linear scan). 
+ getSection(".debug_aranges", &aranges_); +} + +bool Dwarf::readAbbreviation(std::string_view & section, DIEAbbreviation & abbr) +{ + // abbreviation code + abbr.code = readULEB(section); + if (abbr.code == 0) + return false; + + // abbreviation tag + abbr.tag = readULEB(section); + + // does this entry have children? + abbr.hasChildren = (read(section) != DW_CHILDREN_no); + + // attributes + const char * attributeBegin = section.data(); + for (;;) + { + SAFE_CHECK(!section.empty(), "invalid attribute section"); + auto attr = readAttribute(section); + if (attr.name == 0 && attr.form == 0) + break; + } + + abbr.attributes = std::string_view(attributeBegin, section.data() - attributeBegin); + return true; +} + +Dwarf::DIEAbbreviation::Attribute Dwarf::readAttribute(std::string_view & sp) +{ + return {readULEB(sp), readULEB(sp)}; +} + +Dwarf::DIEAbbreviation Dwarf::getAbbreviation(uint64_t code, uint64_t offset) const +{ + // Linear search in the .debug_abbrev section, starting at offset + std::string_view section = abbrev_; + section.remove_prefix(offset); + + Dwarf::DIEAbbreviation abbr; + while (readAbbreviation(section, abbr)) + if (abbr.code == code) + return abbr; + + SAFE_CHECK(false, "could not find abbreviation code"); +} + +Dwarf::AttributeValue Dwarf::readAttributeValue(std::string_view & sp, uint64_t form, bool is64Bit) const +{ + switch (form) + { + case DW_FORM_addr: + return read(sp); + case DW_FORM_block1: + return readBytes(sp, read(sp)); + case DW_FORM_block2: + return readBytes(sp, read(sp)); + case DW_FORM_block4: + return readBytes(sp, read(sp)); + case DW_FORM_block: [[fallthrough]]; + case DW_FORM_exprloc: + return readBytes(sp, readULEB(sp)); + case DW_FORM_data1: [[fallthrough]]; + case DW_FORM_ref1: + return read(sp); + case DW_FORM_data2: [[fallthrough]]; + case DW_FORM_ref2: + return read(sp); + case DW_FORM_data4: [[fallthrough]]; + case DW_FORM_ref4: + return read(sp); + case DW_FORM_data8: [[fallthrough]]; + case DW_FORM_ref8: + return 
read(sp); + case DW_FORM_sdata: + return readSLEB(sp); + case DW_FORM_udata: [[fallthrough]]; + case DW_FORM_ref_udata: + return readULEB(sp); + case DW_FORM_flag: + return read(sp); + case DW_FORM_flag_present: + return 1; + case DW_FORM_sec_offset: [[fallthrough]]; + case DW_FORM_ref_addr: + return readOffset(sp, is64Bit); + case DW_FORM_string: + return readNullTerminated(sp); + case DW_FORM_strp: + return getStringFromStringSection(readOffset(sp, is64Bit)); + case DW_FORM_indirect: // form is explicitly specified + return readAttributeValue(sp, readULEB(sp), is64Bit); + default: + SAFE_CHECK(false, "invalid attribute form"); + } +} + +std::string_view Dwarf::getStringFromStringSection(uint64_t offset) const +{ + SAFE_CHECK(offset < strings_.size(), "invalid strp offset"); + std::string_view sp(strings_); + sp.remove_prefix(offset); + return readNullTerminated(sp); +} + +/** + * Find @address in .debug_aranges and return the offset in + * .debug_info for compilation unit to which this address belongs. + */ +bool Dwarf::findDebugInfoOffset(uintptr_t address, std::string_view aranges, uint64_t & offset) +{ + Section arangesSection(aranges); + std::string_view chunk; + while (arangesSection.next(chunk)) + { + auto version = read(chunk); + SAFE_CHECK(version == 2, "invalid aranges version"); + + offset = readOffset(chunk, arangesSection.is64Bit()); + auto addressSize = read(chunk); + SAFE_CHECK(addressSize == sizeof(uintptr_t), "invalid address size"); + auto segmentSize = read(chunk); + SAFE_CHECK(segmentSize == 0, "segmented architecture not supported"); + + // Padded to a multiple of 2 addresses. + // Strangely enough, this is the only place in the DWARF spec that requires + // padding. + skipPadding(chunk, aranges.data(), 2 * sizeof(uintptr_t)); + for (;;) + { + auto start = read(chunk); + auto length = read(chunk); + + if (start == 0 && length == 0) + break; + + // Is our address in this range? 
+ if (address >= start && address < start + length) + return true; + } + } + return false; +} + +/** + * Find the @locationInfo for @address in the compilation unit represented + * by the @sp .debug_info entry. + * Returns whether the address was found. + * Advances @sp to the next entry in .debug_info. + */ +bool Dwarf::findLocation(uintptr_t address, std::string_view & infoEntry, LocationInfo & locationInfo) const +{ + // For each compilation unit compiled with a DWARF producer, a + // contribution is made to the .debug_info section of the object + // file. Each such contribution consists of a compilation unit + // header (see Section 7.5.1.1) followed by a single + // DW_TAG_compile_unit or DW_TAG_partial_unit debugging information + // entry, together with its children. + + // 7.5.1.1 Compilation Unit Header + // 1. unit_length (4B or 12B): read by Section::next + // 2. version (2B) + // 3. debug_abbrev_offset (4B or 8B): offset into the .debug_abbrev section + // 4. address_size (1B) + + Section debugInfoSection(infoEntry); + std::string_view chunk; + SAFE_CHECK(debugInfoSection.next(chunk), "invalid debug info"); + + auto version = read(chunk); + SAFE_CHECK(version >= 2 && version <= 4, "invalid info version"); + uint64_t abbrevOffset = readOffset(chunk, debugInfoSection.is64Bit()); + auto addressSize = read(chunk); + SAFE_CHECK(addressSize == sizeof(uintptr_t), "invalid address size"); + + // We survived so far. The first (and only) DIE should be DW_TAG_compile_unit + // NOTE: - binutils <= 2.25 does not issue DW_TAG_partial_unit. + // - dwarf compression tools like `dwz` may generate it. + // TODO(tudorb): Handle DW_TAG_partial_unit? + auto code = readULEB(chunk); + SAFE_CHECK(code != 0, "invalid code"); + auto abbr = getAbbreviation(code, abbrevOffset); + SAFE_CHECK(abbr.tag == DW_TAG_compile_unit, "expecting compile unit entry"); + // Skip children entries, remove_prefix to the next compilation unit entry. 
+ infoEntry.remove_prefix(chunk.end() - infoEntry.begin()); + + // Read attributes, extracting the few we care about + bool foundLineOffset = false; + uint64_t lineOffset = 0; + std::string_view compilationDirectory; + std::string_view mainFileName; + + DIEAbbreviation::Attribute attr; + std::string_view attributes = abbr.attributes; + for (;;) + { + attr = readAttribute(attributes); + if (attr.name == 0 && attr.form == 0) + { + break; + } + auto val = readAttributeValue(chunk, attr.form, debugInfoSection.is64Bit()); + switch (attr.name) + { + case DW_AT_stmt_list: + // Offset in .debug_line for the line number VM program for this + // compilation unit + lineOffset = std::get(val); + foundLineOffset = true; + break; + case DW_AT_comp_dir: + // Compilation directory + compilationDirectory = std::get(val); + break; + case DW_AT_name: + // File name of main file being compiled + mainFileName = std::get(val); + break; + } + } + + if (!mainFileName.empty()) + { + locationInfo.hasMainFile = true; + locationInfo.mainFile = Path(compilationDirectory, "", mainFileName); + } + + if (!foundLineOffset) + { + return false; + } + + std::string_view lineSection(line_); + lineSection.remove_prefix(lineOffset); + LineNumberVM lineVM(lineSection, compilationDirectory); + + // Execute line number VM program to find file and line + locationInfo.hasFileAndLine = lineVM.findAddress(address, locationInfo.file, locationInfo.line); + return locationInfo.hasFileAndLine; +} + +bool Dwarf::findAddress(uintptr_t address, LocationInfo & locationInfo, LocationInfoMode mode) const +{ + locationInfo = LocationInfo(); + + if (mode == LocationInfoMode::DISABLED) + { + return false; + } + + if (!elf_) + { // No file. + return false; + } + + if (!aranges_.empty()) + { + // Fast path: find the right .debug_info entry by looking up the + // address in .debug_aranges. 
+ uint64_t offset = 0; + if (findDebugInfoOffset(address, aranges_, offset)) + { + // Read compilation unit header from .debug_info + std::string_view infoEntry(info_); + infoEntry.remove_prefix(offset); + findLocation(address, infoEntry, locationInfo); + return locationInfo.hasFileAndLine; + } + else if (mode == LocationInfoMode::FAST) + { + // NOTE: Clang (when using -gdwarf-aranges) doesn't generate entries + // in .debug_aranges for some functions, but always generates + // .debug_info entries. Scanning .debug_info is slow, so fall back to + // it only if such behavior is requested via LocationInfoMode. + return false; + } + else + { + SAFE_CHECK(mode == LocationInfoMode::FULL, "unexpected mode"); + // Fall back to the linear scan. + } + } + + // Slow path (linear scan): Iterate over all .debug_info entries + // and look for the address in each compilation unit. + std::string_view infoEntry(info_); + while (!infoEntry.empty() && !locationInfo.hasFileAndLine) + findLocation(address, infoEntry, locationInfo); + + return locationInfo.hasFileAndLine; +} + +Dwarf::LineNumberVM::LineNumberVM(std::string_view data, std::string_view compilationDirectory) + : compilationDirectory_(compilationDirectory) +{ + Section section(data); + SAFE_CHECK(section.next(data_), "invalid line number VM"); + is64Bit_ = section.is64Bit(); + init(); + reset(); +} + +void Dwarf::LineNumberVM::reset() +{ + address_ = 0; + file_ = 1; + line_ = 1; + column_ = 0; + isStmt_ = defaultIsStmt_; + basicBlock_ = false; + endSequence_ = false; + prologueEnd_ = false; + epilogueBegin_ = false; + isa_ = 0; + discriminator_ = 0; +} + +void Dwarf::LineNumberVM::init() +{ + version_ = read(data_); + SAFE_CHECK(version_ >= 2 && version_ <= 4, "invalid version in line number VM"); + uint64_t headerLength = readOffset(data_, is64Bit_); + SAFE_CHECK(headerLength <= data_.size(), "invalid line number VM header length"); + std::string_view header(data_.data(), headerLength); + data_ = 
std::string_view(header.end(), data_.end() - header.end()); + + minLength_ = read(header); + if (version_ == 4) + { // Version 2 and 3 records don't have this + uint8_t maxOpsPerInstruction = read(header); + SAFE_CHECK(maxOpsPerInstruction == 1, "VLIW not supported"); + } + defaultIsStmt_ = read(header); + lineBase_ = read(header); // yes, signed + lineRange_ = read(header); + opcodeBase_ = read(header); + SAFE_CHECK(opcodeBase_ != 0, "invalid opcode base"); + standardOpcodeLengths_ = reinterpret_cast(header.data()); + header.remove_prefix(opcodeBase_ - 1); + + // We don't want to use heap, so we don't keep an unbounded amount of state. + // We'll just skip over include directories and file names here, and + // we'll loop again when we actually need to retrieve one. + std::string_view sp; + const char * tmp = header.data(); + includeDirectoryCount_ = 0; + while (!(sp = readNullTerminated(header)).empty()) + { + ++includeDirectoryCount_; + } + includeDirectories_ = std::string_view(tmp, header.data() - tmp); + + tmp = header.data(); + FileName fn; + fileNameCount_ = 0; + while (readFileName(header, fn)) + { + ++fileNameCount_; + } + fileNames_ = std::string_view(tmp, header.data() - tmp); +} + +bool Dwarf::LineNumberVM::next(std::string_view & program) +{ + Dwarf::LineNumberVM::StepResult ret; + do + { + ret = step(program); + } while (ret == CONTINUE); + + return (ret == COMMIT); +} + +Dwarf::LineNumberVM::FileName Dwarf::LineNumberVM::getFileName(uint64_t index) const +{ + SAFE_CHECK(index != 0, "invalid file index 0"); + + FileName fn; + if (index <= fileNameCount_) + { + std::string_view fileNames = fileNames_; + for (; index; --index) + { + if (!readFileName(fileNames, fn)) + { + abort(); + } + } + return fn; + } + + index -= fileNameCount_; + + std::string_view program = data_; + for (; index; --index) + { + SAFE_CHECK(nextDefineFile(program, fn), "invalid file index"); + } + + return fn; +} + +std::string_view Dwarf::LineNumberVM::getIncludeDirectory(uint64_t 
index) const +{ + if (index == 0) + { + return std::string_view(); + } + + SAFE_CHECK(index <= includeDirectoryCount_, "invalid include directory"); + + std::string_view includeDirectories = includeDirectories_; + std::string_view dir; + for (; index; --index) + { + dir = readNullTerminated(includeDirectories); + if (dir.empty()) + { + abort(); // BUG + } + } + + return dir; +} + +bool Dwarf::LineNumberVM::readFileName(std::string_view & program, FileName & fn) +{ + fn.relativeName = readNullTerminated(program); + if (fn.relativeName.empty()) + { + return false; + } + fn.directoryIndex = readULEB(program); + // Skip over file size and last modified time + readULEB(program); + readULEB(program); + return true; +} + +bool Dwarf::LineNumberVM::nextDefineFile(std::string_view & program, FileName & fn) const +{ + while (!program.empty()) + { + auto opcode = read(program); + + if (opcode >= opcodeBase_) + { // special opcode + continue; + } + + if (opcode != 0) + { // standard opcode + // Skip, slurp the appropriate number of LEB arguments + uint8_t argCount = standardOpcodeLengths_[opcode - 1]; + while (argCount--) + { + readULEB(program); + } + continue; + } + + // Extended opcode: dispatch on the sub-opcode that follows the length, not on the leading 0 byte + auto length = readULEB(program); + // the opcode itself should be included in the length, so length >= 1 + SAFE_CHECK(length != 0, "invalid extended opcode length"); + auto extendedOpcode = read(program); + --length; + + if (extendedOpcode == DW_LNE_define_file) + { + SAFE_CHECK(readFileName(program, fn), "invalid empty file in DW_LNE_define_file"); + return true; + } + + program.remove_prefix(length); + continue; + } + + return false; +} + +Dwarf::LineNumberVM::StepResult Dwarf::LineNumberVM::step(std::string_view & program) +{ + auto opcode = read(program); + + if (opcode >= opcodeBase_) + { // special opcode + uint8_t adjustedOpcode = opcode - opcodeBase_; + uint8_t opAdvance = adjustedOpcode / lineRange_; + + address_ += minLength_ * opAdvance; + line_ += lineBase_ + adjustedOpcode % lineRange_; +
+ basicBlock_ = false; + prologueEnd_ = false; + epilogueBegin_ = false; + discriminator_ = 0; + return COMMIT; + } + + if (opcode != 0) + { // standard opcode + // Only interpret opcodes that are recognized by the version we're parsing; + // the others are vendor extensions and we should ignore them. + switch (opcode) + { + case DW_LNS_copy: + basicBlock_ = false; + prologueEnd_ = false; + epilogueBegin_ = false; + discriminator_ = 0; + return COMMIT; + case DW_LNS_advance_pc: + address_ += minLength_ * readULEB(program); + return CONTINUE; + case DW_LNS_advance_line: + line_ += readSLEB(program); + return CONTINUE; + case DW_LNS_set_file: + file_ = readULEB(program); + return CONTINUE; + case DW_LNS_set_column: + column_ = readULEB(program); + return CONTINUE; + case DW_LNS_negate_stmt: + isStmt_ = !isStmt_; + return CONTINUE; + case DW_LNS_set_basic_block: + basicBlock_ = true; + return CONTINUE; + case DW_LNS_const_add_pc: + address_ += minLength_ * ((255 - opcodeBase_) / lineRange_); + return CONTINUE; + case DW_LNS_fixed_advance_pc: + address_ += read(program); + return CONTINUE; + case DW_LNS_set_prologue_end: + if (version_ == 2) + { + break; // not supported in version 2 + } + prologueEnd_ = true; + return CONTINUE; + case DW_LNS_set_epilogue_begin: + if (version_ == 2) + { + break; // not supported in version 2 + } + epilogueBegin_ = true; + return CONTINUE; + case DW_LNS_set_isa: + if (version_ == 2) + { + break; // not supported in version 2 + } + isa_ = readULEB(program); + return CONTINUE; + } + + // Unrecognized standard opcode, slurp the appropriate number of LEB + // arguments. 
+ uint8_t argCount = standardOpcodeLengths_[opcode - 1]; + while (argCount--) + { + readULEB(program); + } + return CONTINUE; + } + + // Extended opcode + auto length = readULEB(program); + // the opcode itself should be included in the length, so length >= 1 + SAFE_CHECK(length != 0, "invalid extended opcode length"); + auto extendedOpcode = read(program); + --length; + + switch (extendedOpcode) + { + case DW_LNE_end_sequence: + return END; + case DW_LNE_set_address: + address_ = read(program); + return CONTINUE; + case DW_LNE_define_file: + // We can't process DW_LNE_define_file here, as it would require us to + // use unbounded amounts of state (ie. use the heap). We'll do a second + // pass (using nextDefineFile()) if necessary. + break; + case DW_LNE_set_discriminator: + discriminator_ = readULEB(program); + return CONTINUE; + } + + // Unrecognized extended opcode + program.remove_prefix(length); + return CONTINUE; +} + +bool Dwarf::LineNumberVM::findAddress(uintptr_t target, Path & file, uint64_t & line) +{ + std::string_view program = data_; + + // Within each sequence of instructions, the address may only increase. + // Unfortunately, within the same compilation unit, sequences may appear + // in any order. So any sequence is a candidate if it starts at an address + // <= the target address, and we know we've found the target address if + // a candidate crosses the target address. + enum State + { + START, + LOW_SEQ, // candidate + HIGH_SEQ + }; + State state = START; + reset(); + + uint64_t prevFile = 0; + uint64_t prevLine = 0; + while (!program.empty()) + { + bool seqEnd = !next(program); + + if (state == START) + { + if (!seqEnd) + { + state = address_ <= target ? LOW_SEQ : HIGH_SEQ; + } + } + + if (state == LOW_SEQ) + { + if (address_ > target) + { + // Found it! 
Note that ">" is indeed correct (not ">="), as each + // sequence is guaranteed to have one entry past-the-end (emitted by + // DW_LNE_end_sequence) + if (prevFile == 0) + { + return false; + } + auto fn = getFileName(prevFile); + file = Path(compilationDirectory_, getIncludeDirectory(fn.directoryIndex), fn.relativeName); + line = prevLine; + return true; + } + prevFile = file_; + prevLine = line_; + } + + if (seqEnd) + { + state = START; + reset(); + } + } + + return false; +} + +} diff --git a/dbms/src/Common/Dwarf.h b/dbms/src/Common/Dwarf.h new file mode 100644 index 0000000000..48e2392225 --- /dev/null +++ b/dbms/src/Common/Dwarf.h @@ -0,0 +1,287 @@ +#pragma once + +/* + * Copyright 2012-present Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** This file was edited for ClickHouse. + */ + +#include +#include +#include + + +namespace DB +{ + +class Elf; + +/** + * DWARF record parser. + * + * We only implement enough DWARF functionality to convert from PC address + * to file and line number information. + * + * This means (although they're not part of the public API of this class), we + * can parse Debug Information Entries (DIEs), abbreviations, attributes (of + * all forms), and we can interpret bytecode for the line number VM. 
+ * + * We can interpret DWARF records of version 2, 3, or 4, although we don't + * actually support many of the version 4 features (such as VLIW, multiple + * operations per instruction). + * + * Note that the DWARF record parser does not allocate heap memory at all. + * This is on purpose: you can use the parser from + * memory-constrained situations (such as an exception handler for + * std::out_of_memory). If it weren't for this requirement, some things would + * be much simpler: the Path class would be unnecessary and would be replaced + * with a std::string; the list of file names in the line number VM would be + * kept as a vector of strings instead of re-executing the program to look for + * DW_LNE_define_file instructions, etc. + */ +class Dwarf +{ + // Note that Dwarf uses (and returns) std::string_view a lot. + // These std::string_view values point within sections in the ELF file, and so will + // be live for as long as the passed-in Elf is live. +public: + /** Create a DWARF parser around an ELF file. */ + explicit Dwarf(const Elf & elf); + + /** + * Represent a file path as a collection of three parts (base directory, + * subdirectory, and file). + */ + class Path + { + public: + Path() {} + + Path(std::string_view baseDir, std::string_view subDir, std::string_view file); + + std::string_view baseDir() const { return baseDir_; } + std::string_view subDir() const { return subDir_; } + std::string_view file() const { return file_; } + + size_t size() const; + + /** + * Copy the Path to a buffer of size bufSize. + * + * toBuffer behaves like snprintf: It will always null-terminate the + * buffer (so it will copy at most bufSize-1 bytes), and it will return + * the number of bytes that would have been written if there had been + * enough room, so, if toBuffer returns a value >= bufSize, the output + * was truncated.
+ */ + size_t toBuffer(char * buf, size_t bufSize) const; + + void toString(std::string & dest) const; + std::string toString() const + { + std::string s; + toString(s); + return s; + } + + // TODO(tudorb): Implement operator==, operator!=; not as easy as it + // seems as the same path can be represented in multiple ways + private: + std::string_view baseDir_; + std::string_view subDir_; + std::string_view file_; + }; + + enum class LocationInfoMode + { + // Don't resolve location info. + DISABLED, + // Perform CU lookup using .debug_aranges (might be incomplete). + FAST, + // Scan all CU in .debug_info (slow!) on .debug_aranges lookup failure. + FULL, + }; + + struct LocationInfo + { + bool hasMainFile = false; + Path mainFile; + + bool hasFileAndLine = false; + Path file; + uint64_t line = 0; + }; + + /** + * Find the file and line number information corresponding to address. + */ + bool findAddress(uintptr_t address, LocationInfo & info, LocationInfoMode mode) const; + +private: + static bool findDebugInfoOffset(uintptr_t address, std::string_view aranges, uint64_t & offset); + + void init(); + bool findLocation(uintptr_t address, std::string_view & infoEntry, LocationInfo & info) const; + + const Elf * elf_; + + // DWARF section made up of chunks, each prefixed with a length header. + // The length indicates whether the chunk is DWARF-32 or DWARF-64, which + // guides interpretation of "section offset" records. + // (yes, DWARF-32 and DWARF-64 sections may coexist in the same file) + class Section + { + public: + Section() : is64Bit_(false) {} + + explicit Section(std::string_view d); + + // Return next chunk, if any; the 4- or 12-byte length was already + // parsed and isn't part of the chunk. + bool next(std::string_view & chunk); + + // Is the current chunk 64 bit? + bool is64Bit() const { return is64Bit_; } + + private: + // Yes, 32- and 64- bit sections may coexist. Yikes! 
+ bool is64Bit_; + std::string_view data_; + }; + + // Abbreviation for a Debugging Information Entry. + struct DIEAbbreviation + { + uint64_t code; + uint64_t tag; + bool hasChildren; + + struct Attribute + { + uint64_t name; + uint64_t form; + }; + + std::string_view attributes; + }; + + // Interpreter for the line number bytecode VM + class LineNumberVM + { + public: + LineNumberVM(std::string_view data, std::string_view compilationDirectory); + + bool findAddress(uintptr_t address, Path & file, uint64_t & line); + + private: + void init(); + void reset(); + + // Execute until we commit one new row to the line number matrix + bool next(std::string_view & program); + enum StepResult + { + CONTINUE, // Continue feeding opcodes + COMMIT, // Commit new tuple + END, // End of sequence + }; + // Execute one opcode + StepResult step(std::string_view & program); + + struct FileName + { + std::string_view relativeName; + // 0 = current compilation directory + // otherwise, 1-based index in the list of include directories + uint64_t directoryIndex; + }; + // Read one FileName object, remove_prefix sp + static bool readFileName(std::string_view & sp, FileName & fn); + + // Get file name at given index; may be in the initial table + // (fileNames_) or defined using DW_LNE_define_file (and we reexecute + // enough of the program to find it, if so) + FileName getFileName(uint64_t index) const; + + // Get include directory at given index + std::string_view getIncludeDirectory(uint64_t index) const; + + // Execute opcodes until finding a DW_LNE_define_file and return true; + // return file at the end. 
+ bool nextDefineFile(std::string_view & program, FileName & fn) const; + + // Initialization + bool is64Bit_; + std::string_view data_; + std::string_view compilationDirectory_; + + // Header + uint16_t version_; + uint8_t minLength_; + bool defaultIsStmt_; + int8_t lineBase_; + uint8_t lineRange_; + uint8_t opcodeBase_; + const uint8_t * standardOpcodeLengths_; + + std::string_view includeDirectories_; + size_t includeDirectoryCount_; + + std::string_view fileNames_; + size_t fileNameCount_; + + // State machine registers + uint64_t address_; + uint64_t file_; + uint64_t line_; + uint64_t column_; + bool isStmt_; + bool basicBlock_; + bool endSequence_; + bool prologueEnd_; + bool epilogueBegin_; + uint64_t isa_; + uint64_t discriminator_; + }; + + // Read an abbreviation from a std::string_view, return true if at end; remove_prefix sp + static bool readAbbreviation(std::string_view & sp, DIEAbbreviation & abbr); + + // Get abbreviation corresponding to a code, in the chunk starting at + // offset in the .debug_abbrev section + DIEAbbreviation getAbbreviation(uint64_t code, uint64_t offset) const; + + // Read one attribute pair, remove_prefix sp; returns <0, 0> at end. 
+ static DIEAbbreviation::Attribute readAttribute(std::string_view & sp); + + // Read one attribute value, remove_prefix sp + typedef std::variant AttributeValue; + AttributeValue readAttributeValue(std::string_view & sp, uint64_t form, bool is64Bit) const; + + // Get an ELF section by name, return true if found + bool getSection(const char * name, std::string_view * section) const; + + // Get a string from the .debug_str section + std::string_view getStringFromStringSection(uint64_t offset) const; + + std::string_view info_; // .debug_info + std::string_view abbrev_; // .debug_abbrev + std::string_view aranges_; // .debug_aranges + std::string_view line_; // .debug_line + std::string_view strings_; // .debug_str +}; + +} diff --git a/dbms/src/Common/Elf.cpp b/dbms/src/Common/Elf.cpp index 85aed3367c..05767dfe60 100644 --- a/dbms/src/Common/Elf.cpp +++ b/dbms/src/Common/Elf.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include namespace DB @@ -16,8 +16,6 @@ namespace ErrorCodes Elf::Elf(const std::string & path) : in(path, 0) { - std::cerr << "Processing path " << path << "\n"; - /// Check if it's an elf. size = in.buffer().size(); if (size < sizeof(ElfEhdr)) @@ -98,6 +96,12 @@ std::optional Elf::findSection(std::function Elf::findSectionByName(const char * name) const +{ + return findSection([&](const Section & section, size_t) { return 0 == strcmp(name, section.name()); }); +} + + const char * Elf::Section::name() const { if (!elf.section_names) @@ -115,7 +119,12 @@ const char * Elf::Section::begin() const const char * Elf::Section::end() const { - return elf.mapped + header.sh_offset + header.sh_size; + return begin() + size(); +} + +size_t Elf::Section::size() const +{ + return header.sh_size; } } diff --git a/dbms/src/Common/Elf.h b/dbms/src/Common/Elf.h index 807e265b11..f9f615dac3 100644 --- a/dbms/src/Common/Elf.h +++ b/dbms/src/Common/Elf.h @@ -21,7 +21,9 @@ using ElfSym = ElfW(Sym); namespace DB { -class Elf +/** Allow to navigate sections in ELF. 
+ */ +class Elf final { public: struct Section @@ -31,6 +33,7 @@ public: const char * begin() const; const char * end() const; + size_t size() const; Section(const ElfShdr & header, const Elf & elf); @@ -40,8 +43,9 @@ public: Elf(const std::string & path); - std::optional
findSection(std::function && pred) const; bool iterateSections(std::function && pred) const; + std::optional
findSection(std::function && pred) const; + std::optional
findSectionByName(const char * name) const; const char * end() const { return mapped + size; } diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index 53f40239d1..5cb6b7e0a3 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -439,6 +439,7 @@ namespace ErrorCodes extern const int CANNOT_DELETE_TIMER = 462; extern const int CANNOT_FCNTL = 463; extern const int CANNOT_PARSE_ELF = 464; + extern const int CANNOT_PARSE_DWARF = 465; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; From 15dc6d1818f42b0c9f6744bafc58dc0ec78c74b5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 29 Jul 2019 21:38:04 +0300 Subject: [PATCH 05/21] Advancements --- dbms/src/Common/Dwarf.h | 6 +- dbms/src/Common/Elf.cpp | 10 +-- dbms/src/Common/Elf.h | 8 ++- dbms/src/Common/SymbolIndex.cpp | 82 +++++++++++++------------ dbms/src/Common/SymbolIndex.h | 22 +++++-- dbms/src/Common/tests/symbol_index.cpp | 21 +++++-- dbms/src/Functions/symbolizeAddress.cpp | 2 +- 7 files changed, 89 insertions(+), 62 deletions(-) diff --git a/dbms/src/Common/Dwarf.h b/dbms/src/Common/Dwarf.h index 48e2392225..5bc358df86 100644 --- a/dbms/src/Common/Dwarf.h +++ b/dbms/src/Common/Dwarf.h @@ -52,7 +52,7 @@ class Elf; * kept as a vector of strings instead of re-executing the program to look for * DW_LNE_define_file instructions, etc. */ -class Dwarf +class Dwarf final { // Note that Dwarf uses (and returns) std::string_view a lot. // The std::string_view point within sections in the ELF file, and so will @@ -126,8 +126,8 @@ public: }; /** - * Find the file and line number information corresponding to address. - */ + * Find the file and line number information corresponding to address. 
+ */ bool findAddress(uintptr_t address, LocationInfo & info, LocationInfoMode mode) const; private: diff --git a/dbms/src/Common/Elf.cpp b/dbms/src/Common/Elf.cpp index 05767dfe60..bb51b837a1 100644 --- a/dbms/src/Common/Elf.cpp +++ b/dbms/src/Common/Elf.cpp @@ -17,8 +17,8 @@ Elf::Elf(const std::string & path) : in(path, 0) { /// Check if it's an elf. - size = in.buffer().size(); - if (size < sizeof(ElfEhdr)) + elf_size = in.buffer().size(); + if (elf_size < sizeof(ElfEhdr)) throw Exception("The size of supposedly ELF file is too small", ErrorCodes::CANNOT_PARSE_ELF); mapped = in.buffer().begin(); @@ -33,7 +33,7 @@ Elf::Elf(const std::string & path) if (!section_header_offset || !section_header_num_entries - || section_header_offset + section_header_num_entries * sizeof(ElfShdr) > size) + || section_header_offset + section_header_num_entries * sizeof(ElfShdr) > elf_size) throw Exception("The ELF is truncated (section header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF); section_headers = reinterpret_cast(mapped + section_header_offset); @@ -48,7 +48,7 @@ Elf::Elf(const std::string & path) throw Exception("The ELF doesn't have string table with section names", ErrorCodes::CANNOT_PARSE_ELF); ElfOff section_names_offset = section_names_strtab->header.sh_offset; - if (section_names_offset >= size) + if (section_names_offset >= elf_size) throw Exception("The ELF is truncated (section names string table points after end of file)", ErrorCodes::CANNOT_PARSE_ELF); section_names = reinterpret_cast(mapped + section_names_offset); @@ -68,7 +68,7 @@ bool Elf::iterateSections(std::function size) + if (section.header.sh_offset + section.header.sh_size > elf_size) continue; if (pred(section, idx)) diff --git a/dbms/src/Common/Elf.h b/dbms/src/Common/Elf.h index f9f615dac3..7f7fcc538b 100644 --- a/dbms/src/Common/Elf.h +++ b/dbms/src/Common/Elf.h @@ -41,17 +41,19 @@ public: const Elf & elf; }; - Elf(const std::string & path); + explicit Elf(const std::string & path); 
bool iterateSections(std::function && pred) const; std::optional
findSection(std::function && pred) const; std::optional
findSectionByName(const char * name) const; - const char * end() const { return mapped + size; } + const char * begin() const { return mapped; } + const char * end() const { return mapped + elf_size; } + size_t size() const { return elf_size; } private: MMapReadBufferFromFile in; - size_t size; + size_t elf_size; const char * mapped; const ElfEhdr * header; const ElfShdr * section_headers; diff --git a/dbms/src/Common/SymbolIndex.cpp b/dbms/src/Common/SymbolIndex.cpp index 24fc93aec9..b315abead7 100644 --- a/dbms/src/Common/SymbolIndex.cpp +++ b/dbms/src/Common/SymbolIndex.cpp @@ -24,7 +24,8 @@ namespace /// Based on the code of musl-libc and the answer of Kanalpiroge on /// https://stackoverflow.com/questions/15779185/list-all-the-functions-symbols-on-the-fly-in-c-code-on-a-linux-architecture -void collectSymbolsFromProgramHeaders(dl_phdr_info * info, std::vector & symbols) +void collectSymbolsFromProgramHeaders(dl_phdr_info * info, + std::vector & symbols) { /* Iterate over all headers of the current shared lib * (first call is for the executable itself) */ @@ -40,8 +41,6 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, std::vector(info->dlpi_addr + info->dlpi_phdr[header_index].p_vaddr); -// std::cerr << "dlpi_addr: " << info->dlpi_addr << "\n"; - /// For unknown reason, addresses are sometimes relative sometimes absolute. 
auto correct_address = [](ElfW(Addr) base, ElfW(Addr) ptr) { @@ -53,25 +52,17 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, std::vectord_tag != DT_NULL; ++it) - std::cerr << it->d_tag << "\n";*/ - size_t sym_cnt = 0; for (auto it = dyn_begin; it->d_tag != DT_NULL; ++it) { if (it->d_tag == DT_HASH) { const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); - -// std::cerr << it->d_un.d_ptr << ", " << it->d_un.d_val << "\n"; - sym_cnt = hash[1]; break; } else if (it->d_tag == DT_GNU_HASH) { -// std::cerr << it->d_un.d_ptr << ", " << it->d_un.d_val << "\n"; - /// This code based on Musl-libc. const uint32_t * buckets = nullptr; @@ -100,7 +91,6 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, std::vectord_tag != DT_NULL; ++it) { if (it->d_tag == DT_SYMTAB) @@ -141,8 +129,6 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, std::vector(info->dlpi_addr + elf_sym[sym_index].st_value); symbol.address_end = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value + elf_sym[sym_index].st_size); @@ -226,7 +212,9 @@ bool searchAndCollectSymbolsFromELFSymbolTable( } -void collectSymbolsFromELF(dl_phdr_info * info, std::vector & symbols) +void collectSymbolsFromELF(dl_phdr_info * info, + std::vector & symbols, + std::vector & objects) { std::string object_name = info->dlpi_name; @@ -244,6 +232,12 @@ void collectSymbolsFromELF(dl_phdr_info * info, std::vector(info->dlpi_addr); + object.address_end = reinterpret_cast(info->dlpi_addr + elf.size()); + object.name = object_name; + objects.push_back(std::move(object)); + searchAndCollectSymbolsFromELFSymbolTable(info, elf, SHT_SYMTAB, ".strtab", symbols); searchAndCollectSymbolsFromELFSymbolTable(info, elf, SHT_DYNSYM, ".dynstr", symbols); } @@ -253,21 +247,41 @@ void collectSymbolsFromELF(dl_phdr_info * info, std::vector & symbols = *reinterpret_cast *>(out_symbols); + DB::SymbolIndex::Data & data = *reinterpret_cast(data_ptr); - 
collectSymbolsFromProgramHeaders(info, symbols); - collectSymbolsFromELF(info, symbols); + collectSymbolsFromProgramHeaders(info, data.symbols); + collectSymbolsFromELF(info, data.symbols, data.objects); /* Continue iterations */ return 0; } + +template +const T * find(const void * address, const std::vector & vec) +{ + /// First range that has left boundary greater than address. + + auto it = std::lower_bound(vec.begin(), vec.end(), address, + [](const T & symbol, const void * addr) { return symbol.address_begin <= addr; }); + + if (it == vec.begin()) + return nullptr; + else + --it; /// Last range that has left boundary less or equals than address. + + if (address >= it->address_begin && address < it->address_end) + return &*it; + else + return nullptr; +} + } @@ -276,28 +290,18 @@ namespace DB void SymbolIndex::update() { - dl_iterate_phdr(collectSymbols, &symbols); - std::sort(symbols.begin(), symbols.end()); + dl_iterate_phdr(collectSymbols, &data.symbols); + std::sort(data.symbols.begin(), data.symbols.end(), [](const Symbol & a, const Symbol & b) { return a.address_begin < b.address_begin; }); } -const SymbolIndex::Symbol * SymbolIndex::find(const void * address) const +const SymbolIndex::Symbol * SymbolIndex::findSymbol(const void * address) const { - /// First range that has left boundary greater than address. + return find(address, data.symbols); +} -// std::cerr << "Searching " << address << "\n"; - - auto it = std::lower_bound(symbols.begin(), symbols.end(), address); - if (it == symbols.begin()) - return nullptr; - else - --it; /// Last range that has left boundary less or equals than address. - -// std::cerr << "Range: " << it->address_begin << " ... 
" << it->address_end << "\n"; - - if (address >= it->address_begin && address < it->address_end) - return &*it; - else - return nullptr; +const SymbolIndex::Object * SymbolIndex::findObject(const void * address) const +{ + return find(address, data.objects); } } diff --git a/dbms/src/Common/SymbolIndex.h b/dbms/src/Common/SymbolIndex.h index 41c7a10648..9d1dceb2c9 100644 --- a/dbms/src/Common/SymbolIndex.h +++ b/dbms/src/Common/SymbolIndex.h @@ -19,21 +19,31 @@ public: const void * address_end; const char * object; std::string name; /// demangled NOTE Can use Arena for strings + }; - bool operator< (const Symbol & rhs) const { return address_begin < rhs.address_begin; } - bool operator< (const void * addr) const { return address_begin <= addr; } + struct Object + { + const void * address_begin; + const void * address_end; + std::string name; }; SymbolIndex() { update(); } void update(); - const Symbol * find(const void * address) const; + const Symbol * findSymbol(const void * address) const; + const Object * findObject(const void * address) const; - auto begin() const { return symbols.cbegin(); } - auto end() const { return symbols.cend(); } + const std::vector & symbols() const { return data.symbols; } + const std::vector & objects() const { return data.objects; } + struct Data + { + std::vector symbols; + std::vector objects; + }; private: - std::vector symbols; + Data data; }; } diff --git a/dbms/src/Common/tests/symbol_index.cpp b/dbms/src/Common/tests/symbol_index.cpp index a9fec7069e..37a044939b 100644 --- a/dbms/src/Common/tests/symbol_index.cpp +++ b/dbms/src/Common/tests/symbol_index.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include #include @@ -18,22 +20,31 @@ int main(int argc, char ** argv) SymbolIndex symbol_index; - for (const auto & symbol : symbol_index) - std::cout << symbol.name << ": " << symbol.address_begin << " ... 
" << symbol.address_end << "\n"; + for (const auto & elem : symbol_index.objects()) + std::cout << elem.name << ": " << elem.address_begin << " ... " << elem.address_end << "\n"; const void * address = reinterpret_cast(std::stoull(argv[1], nullptr, 16)); - auto symbol = symbol_index.find(address); + auto symbol = symbol_index.findSymbol(address); if (symbol) std::cerr << symbol->name << ": " << symbol->address_begin << " ... " << symbol->address_end << "\n"; else - std::cerr << "Not found\n"; + std::cerr << "SymbolIndex: Not found\n"; Dl_info info; if (dladdr(address, &info) && info.dli_sname) std::cerr << demangle(info.dli_sname) << ": " << info.dli_saddr << "\n"; else - std::cerr << "Not found\n"; + std::cerr << "dladdr: Not found\n"; + + Elf elf("/proc/self/exe"); + Dwarf dwarf(elf); + + Dwarf::LocationInfo location; + if (dwarf.findAddress(uintptr_t(address), location, Dwarf::LocationInfoMode::FULL)) + std::cerr << location.file.toString() << ":" << location.line << "\n"; + else + std::cerr << "Dwarf: Not found\n"; return 0; } diff --git a/dbms/src/Functions/symbolizeAddress.cpp b/dbms/src/Functions/symbolizeAddress.cpp index 65c1aa84d3..b4fef64981 100644 --- a/dbms/src/Functions/symbolizeAddress.cpp +++ b/dbms/src/Functions/symbolizeAddress.cpp @@ -73,7 +73,7 @@ public: for (size_t i = 0; i < input_rows_count; ++i) { - if (const auto * symbol = symbol_index.find(reinterpret_cast(data[i]))) + if (const auto * symbol = symbol_index.findSymbol(reinterpret_cast(data[i]))) result_column->insertDataWithTerminatingZero(symbol->name.data(), symbol->name.size() + 1); else result_column->insertDefault(); From 372c4d89b26b99118cb83aff8ea5880eb7cfea5c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 01:26:44 +0300 Subject: [PATCH 06/21] Enabled line numbers in stack traces --- CMakeLists.txt | 4 + dbms/CMakeLists.txt | 5 + dbms/src/Common/Dwarf.cpp | 64 ------------ dbms/src/Common/Exception.h | 2 +- dbms/src/Common/QueryProfiler.cpp | 2 +- .../src 
=> dbms/src/Common}/StackTrace.cpp | 99 +++++++++---------- .../common => dbms/src/Common}/StackTrace.h | 0 dbms/src/Common/SymbolIndex.cpp | 64 ++++++------ dbms/src/Common/SymbolIndex.h | 18 ++-- dbms/src/Common/TraceCollector.cpp | 2 +- dbms/src/Common/tests/symbol_index.cpp | 8 +- dbms/src/Functions/registerFunctions.cpp | 4 +- .../registerFunctionsIntrospection.cpp | 16 +++ dbms/src/Functions/symbolizeAddress.cpp | 5 +- dbms/src/Interpreters/Context.cpp | 2 +- libs/libcommon/CMakeLists.txt | 10 -- libs/libdaemon/src/BaseDaemon.cpp | 2 +- 17 files changed, 130 insertions(+), 177 deletions(-) rename {libs/libcommon/src => dbms/src/Common}/StackTrace.cpp (83%) rename {libs/libcommon/include/common => dbms/src/Common}/StackTrace.h (100%) create mode 100644 dbms/src/Functions/registerFunctionsIntrospection.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index a2cc5f15ac..df711a87a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,10 @@ endif () if (COMPILER_CLANG) # clang: warning: argument unused during compilation: '-specs=/usr/share/dpkg/no-pie-compile.specs' [-Wunused-command-line-argument] set (COMMON_WARNING_FLAGS "${COMMON_WARNING_FLAGS} -Wno-unused-command-line-argument") + # generate ranges for fast "addr2line" search + if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") + set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges") + endif () endif () option (ENABLE_TESTS "Enables tests" ON) diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index c8056ada4e..99db33de3f 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -159,6 +159,11 @@ if (OS_FREEBSD) target_compile_definitions (clickhouse_common_io PUBLIC CLOCK_MONOTONIC_COARSE=CLOCK_MONOTONIC_FAST) endif () +if (USE_UNWIND) + target_compile_definitions (clickhouse_common_io PRIVATE USE_UNWIND=1) + target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${UNWIND_INCLUDE_DIR}) +endif () + add_subdirectory(src/Common/ZooKeeper) add_subdirectory(src/Common/Config) diff 
--git a/dbms/src/Common/Dwarf.cpp b/dbms/src/Common/Dwarf.cpp index 45a5116642..798eb08cc5 100644 --- a/dbms/src/Common/Dwarf.cpp +++ b/dbms/src/Common/Dwarf.cpp @@ -183,66 +183,6 @@ void skipPadding(std::string_view & sp, const char * start, size_t alignment) } } -// Simplify a path -- as much as we can while not moving data around... -/*void simplifyPath(std::string_view & sp) -{ - // Strip leading slashes and useless patterns (./), leaving one initial - // slash. - for (;;) - { - if (sp.empty()) - { - return; - } - - // Strip leading slashes, leaving one. - while (sp.startsWith("//")) - { - sp.remove_prefix(1); - } - - if (sp.startsWith("/./")) - { - // Note 2, not 3, to keep it absolute - sp.remove_prefix(2); - continue; - } - - if (sp.removePrefix("./")) - { - // Also remove any subsequent slashes to avoid making this path absolute. - while (sp.startsWith('/')) - { - sp.remove_prefix(1); - } - continue; - } - - break; - } - - // Strip trailing slashes and useless patterns (/.). - for (;;) - { - if (sp.empty()) - { - return; - } - - // Strip trailing slashes, except when this is the root path. - while (sp.size() > 1 && sp.removeSuffix('/')) - { - } - - if (sp.removeSuffix("/.")) - { - continue; - } - - break; - } -}*/ - } @@ -271,10 +211,6 @@ Dwarf::Path::Path(std::string_view baseDir, std::string_view subDir, std::string baseDir_ = {}; // subDir_ is absolute } -// simplifyPath(baseDir_); -// simplifyPath(subDir_); -// simplifyPath(file_); - // Make sure it's never the case that baseDir_ is empty, but subDir_ isn't. 
if (baseDir_.empty()) { diff --git a/dbms/src/Common/Exception.h b/dbms/src/Common/Exception.h index ee89796222..6b0656f482 100644 --- a/dbms/src/Common/Exception.h +++ b/dbms/src/Common/Exception.h @@ -6,7 +6,7 @@ #include -#include +#include namespace Poco { class Logger; } diff --git a/dbms/src/Common/QueryProfiler.cpp b/dbms/src/Common/QueryProfiler.cpp index 5aafa35df9..08399a49d2 100644 --- a/dbms/src/Common/QueryProfiler.cpp +++ b/dbms/src/Common/QueryProfiler.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/libs/libcommon/src/StackTrace.cpp b/dbms/src/Common/StackTrace.cpp similarity index 83% rename from libs/libcommon/src/StackTrace.cpp rename to dbms/src/Common/StackTrace.cpp index 8323a737fd..f842b71d15 100644 --- a/libs/libcommon/src/StackTrace.cpp +++ b/dbms/src/Common/StackTrace.cpp @@ -1,16 +1,16 @@ -#include #include #include -#include -#include -#include -#include +#include +#include +#include +#include + +#include +#include +#include +#include -#if USE_UNWIND -#define UNW_LOCAL_ONLY -#include -#endif std::string signalToErrorMessage(int sig, const siginfo_t & info, const ucontext_t & context) { @@ -168,9 +168,9 @@ void * getCallerAddress(const ucontext_t & context) #endif #elif defined(__aarch64__) return reinterpret_cast(context.uc_mcontext.pc); -#endif - +#else return nullptr; +#endif } StackTrace::StackTrace() @@ -195,6 +195,12 @@ StackTrace::StackTrace(NoCapture) { } + +#if USE_UNWIND +extern "C" int unw_backtrace(void **, int); +#endif + + void StackTrace::tryCapture() { size = 0; @@ -227,50 +233,43 @@ std::string StackTrace::toStringImpl(const Frames & frames, size_t size) if (size == 0) return ""; - char ** symbols = backtrace_symbols(frames.data(), size); - if (!symbols) - return ""; + const DB::SymbolIndex & symbol_index = DB::SymbolIndex::instance(); + std::unordered_map dwarfs; - std::stringstream backtrace; - try + std::stringstream out; + + for (size_t i = 0; i < size; ++i) { 
- for (size_t i = 0; i < size; i++) + out << "#" << i << " " << frames[i] << " "; + auto symbol = symbol_index.findSymbol(frames[i]); + if (symbol) { - /// We do "demangling" of names. The name is in parenthesis, before the '+' character. - - char * name_start = nullptr; - char * name_end = nullptr; - std::string demangled_name; int status = 0; - - if (nullptr != (name_start = strchr(symbols[i], '(')) - && nullptr != (name_end = strchr(name_start, '+'))) - { - ++name_start; - *name_end = '\0'; - demangled_name = demangle(name_start, status); - *name_end = '+'; - } - - backtrace << i << ". "; - - if (0 == status && name_start && name_end) - { - backtrace.write(symbols[i], name_start - symbols[i]); - backtrace << demangled_name << name_end; - } - else - backtrace << symbols[i]; - - backtrace << std::endl; + out << demangle(symbol->name, status); } - } - catch (...) - { - free(symbols); - throw; + else + out << "?"; + + out << " "; + + if (auto object = symbol_index.findObject(frames[i])) + { + if (std::filesystem::exists(object->name)) + { + auto dwarf_it = dwarfs.try_emplace(object->name, *object->elf).first; + + DB::Dwarf::LocationInfo location; + if (dwarf_it->second.findAddress(uintptr_t(object->address_begin) + uintptr_t(frames[i]), location, DB::Dwarf::LocationInfoMode::FAST)) + out << location.file.toString() << ":" << location.line; + else + out << object->name; + } + } + else + out << "?"; + + out << "\n"; } - free(symbols); - return backtrace.str(); + return out.str(); } diff --git a/libs/libcommon/include/common/StackTrace.h b/dbms/src/Common/StackTrace.h similarity index 100% rename from libs/libcommon/include/common/StackTrace.h rename to dbms/src/Common/StackTrace.h diff --git a/dbms/src/Common/SymbolIndex.cpp b/dbms/src/Common/SymbolIndex.cpp index b315abead7..ff04ea35ea 100644 --- a/dbms/src/Common/SymbolIndex.cpp +++ b/dbms/src/Common/SymbolIndex.cpp @@ -1,6 +1,4 @@ #include -#include -#include #include #include @@ -11,6 +9,9 @@ #include +namespace 
DB +{ + namespace { @@ -25,7 +26,7 @@ namespace /// Based on the code of musl-libc and the answer of Kanalpiroge on /// https://stackoverflow.com/questions/15779185/list-all-the-functions-symbols-on-the-fly-in-c-code-on-a-linux-architecture void collectSymbolsFromProgramHeaders(dl_phdr_info * info, - std::vector & symbols) + std::vector & symbols) { /* Iterate over all headers of the current shared lib * (first call is for the executable itself) */ @@ -129,13 +130,10 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, if (!sym_name) continue; - DB::SymbolIndex::Symbol symbol; + SymbolIndex::Symbol symbol; symbol.address_begin = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value); symbol.address_end = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value + elf_sym[sym_index].st_size); - int unused = 0; - symbol.name = demangle(sym_name, unused); - symbol.object = info->dlpi_name; - + symbol.name = sym_name; symbols.push_back(std::move(symbol)); } @@ -148,10 +146,10 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, void collectSymbolsFromELFSymbolTable( dl_phdr_info * info, - const DB::Elf & elf, - const DB::Elf::Section & symbol_table, - const DB::Elf::Section & string_table, - std::vector & symbols) + const Elf & elf, + const Elf::Section & symbol_table, + const Elf::Section & string_table, + std::vector & symbols) { /// Iterate symbol table. const ElfSym * symbol_table_entry = reinterpret_cast(symbol_table.begin()); @@ -170,13 +168,13 @@ void collectSymbolsFromELFSymbolTable( /// Find the name in strings table. 
const char * symbol_name = strings + symbol_table_entry->st_name; - DB::SymbolIndex::Symbol symbol; + if (!symbol_name) + continue; + + SymbolIndex::Symbol symbol; symbol.address_begin = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value); symbol.address_end = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value + symbol_table_entry->st_size); - int unused = 0; - symbol.name = demangle(symbol_name, unused); - symbol.object = info->dlpi_name; - + symbol.name = symbol_name; symbols.push_back(std::move(symbol)); } } @@ -184,15 +182,15 @@ void collectSymbolsFromELFSymbolTable( bool searchAndCollectSymbolsFromELFSymbolTable( dl_phdr_info * info, - const DB::Elf & elf, + const Elf & elf, unsigned section_header_type, const char * string_table_name, - std::vector & symbols) + std::vector & symbols) { - std::optional symbol_table; - std::optional string_table; + std::optional symbol_table; + std::optional string_table; - if (!elf.iterateSections([&](const DB::Elf::Section & section, size_t) + if (!elf.iterateSections([&](const Elf::Section & section, size_t) { if (section.header.sh_type == section_header_type) symbol_table.emplace(section); @@ -213,8 +211,8 @@ bool searchAndCollectSymbolsFromELFSymbolTable( void collectSymbolsFromELF(dl_phdr_info * info, - std::vector & symbols, - std::vector & objects) + std::vector & symbols, + std::vector & objects) { std::string object_name = info->dlpi_name; @@ -230,16 +228,17 @@ void collectSymbolsFromELF(dl_phdr_info * info, if (ec) return; - DB::Elf elf(object_name); - - DB::SymbolIndex::Object object; + SymbolIndex::Object object; + object.elf = std::make_unique(object_name); object.address_begin = reinterpret_cast(info->dlpi_addr); - object.address_end = reinterpret_cast(info->dlpi_addr + elf.size()); + object.address_end = reinterpret_cast(info->dlpi_addr + object.elf->size()); object.name = object_name; objects.push_back(std::move(object)); - searchAndCollectSymbolsFromELFSymbolTable(info, elf, 
SHT_SYMTAB, ".strtab", symbols); - searchAndCollectSymbolsFromELFSymbolTable(info, elf, SHT_DYNSYM, ".dynstr", symbols); + searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_SYMTAB, ".strtab", symbols); + + /// Unneeded because they were parsed from "program headers" of loaded objects. + //searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_DYNSYM, ".dynstr", symbols); } @@ -253,7 +252,7 @@ int collectSymbols(dl_phdr_info * info, size_t, void * data_ptr) * (e.g. on a 32 bit system, ElfW(Dyn*) becomes "Elf32_Dyn*") */ - DB::SymbolIndex::Data & data = *reinterpret_cast(data_ptr); + SymbolIndex::Data & data = *reinterpret_cast(data_ptr); collectSymbolsFromProgramHeaders(info, data.symbols); collectSymbolsFromELF(info, data.symbols, data.objects); @@ -285,9 +284,6 @@ const T * find(const void * address, const std::vector & vec) } -namespace DB -{ - void SymbolIndex::update() { dl_iterate_phdr(collectSymbols, &data.symbols); diff --git a/dbms/src/Common/SymbolIndex.h b/dbms/src/Common/SymbolIndex.h index 9d1dceb2c9..41a773f5f4 100644 --- a/dbms/src/Common/SymbolIndex.h +++ b/dbms/src/Common/SymbolIndex.h @@ -2,6 +2,8 @@ #include #include +#include +#include namespace DB @@ -9,16 +11,20 @@ namespace DB /** Allow to quickly find symbol name from address. * Used as a replacement for "dladdr" function which is extremely slow. + * It works better than "dladdr" because it also allows to search private symbols, that are not participated in shared linking. 
*/ -class SymbolIndex +class SymbolIndex : public ext::singleton { +protected: + friend class ext::singleton; + SymbolIndex() { update(); } + public: struct Symbol { const void * address_begin; const void * address_end; - const char * object; - std::string name; /// demangled NOTE Can use Arena for strings + const char * name; }; struct Object @@ -26,11 +32,9 @@ public: const void * address_begin; const void * address_end; std::string name; + std::unique_ptr elf; }; - SymbolIndex() { update(); } - void update(); - const Symbol * findSymbol(const void * address) const; const Object * findObject(const void * address) const; @@ -44,6 +48,8 @@ public: }; private: Data data; + + void update(); }; } diff --git a/dbms/src/Common/TraceCollector.cpp b/dbms/src/Common/TraceCollector.cpp index e66a580289..13d2061c81 100644 --- a/dbms/src/Common/TraceCollector.cpp +++ b/dbms/src/Common/TraceCollector.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/src/Common/tests/symbol_index.cpp b/dbms/src/Common/tests/symbol_index.cpp index 37a044939b..c6a22b1266 100644 --- a/dbms/src/Common/tests/symbol_index.cpp +++ b/dbms/src/Common/tests/symbol_index.cpp @@ -18,9 +18,9 @@ int main(int argc, char ** argv) return 1; } - SymbolIndex symbol_index; + const SymbolIndex & symbol_index = SymbolIndex::instance(); - for (const auto & elem : symbol_index.objects()) + for (const auto & elem : symbol_index.symbols()) std::cout << elem.name << ": " << elem.address_begin << " ... 
" << elem.address_end << "\n"; const void * address = reinterpret_cast(std::stoull(argv[1], nullptr, 16)); @@ -41,10 +41,12 @@ int main(int argc, char ** argv) Dwarf dwarf(elf); Dwarf::LocationInfo location; - if (dwarf.findAddress(uintptr_t(address), location, Dwarf::LocationInfoMode::FULL)) + if (dwarf.findAddress(uintptr_t(address), location, Dwarf::LocationInfoMode::FAST)) std::cerr << location.file.toString() << ":" << location.line << "\n"; else std::cerr << "Dwarf: Not found\n"; + std::cerr << StackTrace().toString() << "\n"; + return 0; } diff --git a/dbms/src/Functions/registerFunctions.cpp b/dbms/src/Functions/registerFunctions.cpp index 178f085e1a..eba9a96e5e 100644 --- a/dbms/src/Functions/registerFunctions.cpp +++ b/dbms/src/Functions/registerFunctions.cpp @@ -40,7 +40,7 @@ void registerFunctionsIntrospection(FunctionFactory &); void registerFunctionsNull(FunctionFactory &); void registerFunctionsFindCluster(FunctionFactory &); void registerFunctionsJSON(FunctionFactory &); -void registerFunctionSymbolizeAddress(FunctionFactory &); +void registerFunctionsIntrospection(FunctionFactory &); void registerFunctions() { @@ -79,7 +79,7 @@ void registerFunctions() registerFunctionsNull(factory); registerFunctionsFindCluster(factory); registerFunctionsJSON(factory); - registerFunctionSymbolizeAddress(factory); + registerFunctionsIntrospection(factory); } } diff --git a/dbms/src/Functions/registerFunctionsIntrospection.cpp b/dbms/src/Functions/registerFunctionsIntrospection.cpp new file mode 100644 index 0000000000..0797d21c36 --- /dev/null +++ b/dbms/src/Functions/registerFunctionsIntrospection.cpp @@ -0,0 +1,16 @@ +namespace DB +{ + +class FunctionFactory; + +void registerFunctionSymbolizeAddress(FunctionFactory & factory); +void registerFunctionDemangle(FunctionFactory & factory); + +void registerFunctionsIntrospection(FunctionFactory & factory) +{ + registerFunctionSymbolizeAddress(factory); + registerFunctionDemangle(factory); +} + +} + diff --git 
a/dbms/src/Functions/symbolizeAddress.cpp b/dbms/src/Functions/symbolizeAddress.cpp index b4fef64981..454fc94b7b 100644 --- a/dbms/src/Functions/symbolizeAddress.cpp +++ b/dbms/src/Functions/symbolizeAddress.cpp @@ -15,7 +15,6 @@ namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int SIZES_OF_ARRAYS_DOESNT_MATCH; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } @@ -60,7 +59,7 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override { - static SymbolIndex symbol_index; + const SymbolIndex & symbol_index = SymbolIndex::instance(); const ColumnPtr & column = block.getByPosition(arguments[0]).column; const ColumnUInt64 * column_concrete = checkAndGetColumn(column.get()); @@ -74,7 +73,7 @@ public: for (size_t i = 0; i < input_rows_count; ++i) { if (const auto * symbol = symbol_index.findSymbol(reinterpret_cast(data[i]))) - result_column->insertDataWithTerminatingZero(symbol->name.data(), symbol->name.size() + 1); + result_column->insertDataWithTerminatingZero(symbol->name, strlen(symbol->name) + 1); else result_column->insertDefault(); } diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index cec36f4246..fc2ada171d 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 8ebd9bddc8..ce8c580161 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -23,12 +23,10 @@ add_library (common src/getThreadNumber.cpp src/sleep.cpp src/argsToConfig.cpp - src/StackTrace.cpp src/Pipe.cpp src/phdr_cache.cpp include/common/SimpleCache.h - include/common/StackTrace.h include/common/Types.h include/common/DayNum.h include/common/DateLUT.h @@ -68,14 +66,6 @@ add_library (common ${CONFIG_COMMON}) 
-if (USE_UNWIND) - target_compile_definitions (common PRIVATE USE_UNWIND=1) - target_include_directories (common BEFORE PRIVATE ${UNWIND_INCLUDE_DIR}) - if (NOT USE_INTERNAL_UNWIND_LIBRARY_FOR_EXCEPTION_HANDLING) - target_link_libraries (common PRIVATE ${UNWIND_LIBRARY}) - endif () -endif () - # When testing for memory leaks with Valgrind, dont link tcmalloc or jemalloc. if (USE_JEMALLOC) diff --git a/libs/libdaemon/src/BaseDaemon.cpp b/libs/libdaemon/src/BaseDaemon.cpp index aa4993acea..16bcb132d3 100644 --- a/libs/libdaemon/src/BaseDaemon.cpp +++ b/libs/libdaemon/src/BaseDaemon.cpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include From ad3f2066d95b4b5e11f8ff62e75b8ed8febaa77d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 01:27:02 +0300 Subject: [PATCH 07/21] Added missing file --- dbms/src/Functions/demange.cpp | 87 ++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 dbms/src/Functions/demange.cpp diff --git a/dbms/src/Functions/demange.cpp b/dbms/src/Functions/demange.cpp new file mode 100644 index 0000000000..8249bf387f --- /dev/null +++ b/dbms/src/Functions/demange.cpp @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +class FunctionDemangle : public IFunction +{ +public: + static constexpr auto name = "demangle"; + static FunctionPtr create(const Context &) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override + { + return 1; + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.size() != 1) + throw Exception("Function " + getName() + " needs exactly one argument; passed " + + 
toString(arguments.size()) + ".", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + const auto & type = arguments[0].type; + + if (!WhichDataType(type.get()).isString()) + throw Exception("The only argument for function " + getName() + " must be String. Found " + + type->getName() + " instead.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return std::make_shared(); + } + + bool useDefaultImplementationForConstants() const override + { + return true; + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override + { + const ColumnPtr & column = block.getByPosition(arguments[0]).column; + const ColumnString * column_concrete = checkAndGetColumn(column.get()); + + if (!column_concrete) + throw Exception("Illegal column " + column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); + + auto result_column = ColumnString::create(); + + for (size_t i = 0; i < input_rows_count; ++i) + { + StringRef source = column_concrete->getDataAt(i); + int status = 0; + result_column.insertData(demangle(source, status)); + } + + block.getByPosition(result).column = std::move(result_column); + } +}; + +void registerFunctionDemangle(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + From 10439bc010ef4a5c7e20feb9ed12e0e2055ce05a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 01:33:28 +0300 Subject: [PATCH 08/21] Addition to prev. 
revision --- dbms/src/Functions/demange.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/demange.cpp b/dbms/src/Functions/demange.cpp index 8249bf387f..eb5b0001e1 100644 --- a/dbms/src/Functions/demange.cpp +++ b/dbms/src/Functions/demange.cpp @@ -71,7 +71,7 @@ public: { StringRef source = column_concrete->getDataAt(i); int status = 0; - result_column.insertData(demangle(source, status)); + result_column.insertData(demangle(source.data, status)); } block.getByPosition(result).column = std::move(result_column); From a05c6026dc562e8aa298f3c6e8d98bf2bdb83ef2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 01:34:05 +0300 Subject: [PATCH 09/21] Addition to prev. revision --- dbms/src/Functions/demange.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/demange.cpp b/dbms/src/Functions/demange.cpp index eb5b0001e1..cc27cbf2b1 100644 --- a/dbms/src/Functions/demange.cpp +++ b/dbms/src/Functions/demange.cpp @@ -71,7 +71,7 @@ public: { StringRef source = column_concrete->getDataAt(i); int status = 0; - result_column.insertData(demangle(source.data, status)); + result_column->insertData(demangle(source.data, status)); } block.getByPosition(result).column = std::move(result_column); From 0cbd4f68ce5441a4b1f8f46b3f3ba59378bde13b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 01:37:26 +0300 Subject: [PATCH 10/21] Addition to prev. 
revision --- dbms/src/Functions/demange.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/src/Functions/demange.cpp b/dbms/src/Functions/demange.cpp index cc27cbf2b1..2cc04cd452 100644 --- a/dbms/src/Functions/demange.cpp +++ b/dbms/src/Functions/demange.cpp @@ -71,7 +71,8 @@ public: { StringRef source = column_concrete->getDataAt(i); int status = 0; - result_column->insertData(demangle(source.data, status)); + std::string result = demangle(source.data, status); + result_column->insertDataWithTerminatingZero(result.data(), result.size() + 1); } block.getByPosition(result).column = std::move(result_column); From 97ac56139b1ba3094e35c1b4ebf907c701ee35c8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 01:50:39 +0300 Subject: [PATCH 11/21] Addition to prev. revision --- dbms/src/Functions/demange.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Functions/demange.cpp b/dbms/src/Functions/demange.cpp index 2cc04cd452..aea14f5bc7 100644 --- a/dbms/src/Functions/demange.cpp +++ b/dbms/src/Functions/demange.cpp @@ -71,8 +71,8 @@ public: { StringRef source = column_concrete->getDataAt(i); int status = 0; - std::string result = demangle(source.data, status); - result_column->insertDataWithTerminatingZero(result.data(), result.size() + 1); + std::string demangled = demangle(source.data, status); + result_column->insertDataWithTerminatingZero(demangled.data(), demangled.size() + 1); } block.getByPosition(result).column = std::move(result_column); From 256e260693530e01bd990bdc509508e57ad0c770 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 02:37:50 +0300 Subject: [PATCH 12/21] Added one more function for introspection --- dbms/src/Common/StackTrace.cpp | 10 +- dbms/src/Functions/addressToLine.cpp | 146 ++++++++++++++++++ .../registerFunctionsIntrospection.cpp | 2 + 3 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 dbms/src/Functions/addressToLine.cpp 
diff --git a/dbms/src/Common/StackTrace.cpp b/dbms/src/Common/StackTrace.cpp index f842b71d15..30fb66f218 100644 --- a/dbms/src/Common/StackTrace.cpp +++ b/dbms/src/Common/StackTrace.cpp @@ -240,8 +240,10 @@ std::string StackTrace::toStringImpl(const Frames & frames, size_t size) for (size_t i = 0; i < size; ++i) { - out << "#" << i << " " << frames[i] << " "; - auto symbol = symbol_index.findSymbol(frames[i]); + const void * addr = frames[i]; + + out << "#" << i << " " << addr << " "; + auto symbol = symbol_index.findSymbol(addr); if (symbol) { int status = 0; @@ -252,14 +254,14 @@ std::string StackTrace::toStringImpl(const Frames & frames, size_t size) out << " "; - if (auto object = symbol_index.findObject(frames[i])) + if (auto object = symbol_index.findObject(addr)) { if (std::filesystem::exists(object->name)) { auto dwarf_it = dwarfs.try_emplace(object->name, *object->elf).first; DB::Dwarf::LocationInfo location; - if (dwarf_it->second.findAddress(uintptr_t(object->address_begin) + uintptr_t(frames[i]), location, DB::Dwarf::LocationInfoMode::FAST)) + if (dwarf_it->second.findAddress(uintptr_t(addr) - uintptr_t(object->address_begin), location, DB::Dwarf::LocationInfoMode::FAST)) out << location.file.toString() << ":" << location.line; else out << object->name; diff --git a/dbms/src/Functions/addressToLine.cpp b/dbms/src/Functions/addressToLine.cpp new file mode 100644 index 0000000000..0e3c4fe65a --- /dev/null +++ b/dbms/src/Functions/addressToLine.cpp @@ -0,0 +1,146 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +class FunctionAddressToLine : public IFunction +{ +public: + static constexpr auto name = "addressToLine"; + static FunctionPtr create(const Context 
&) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override + { + return 1; + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.size() != 1) + throw Exception("Function " + getName() + " needs exactly one argument; passed " + + toString(arguments.size()) + ".", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + const auto & type = arguments[0].type; + + if (!WhichDataType(type.get()).isUInt64()) + throw Exception("The only argument for function " + getName() + " must be UInt64. Found " + + type->getName() + " instead.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return std::make_shared(); + } + + bool useDefaultImplementationForConstants() const override + { + return true; + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override + { + const ColumnPtr & column = block.getByPosition(arguments[0]).column; + const ColumnUInt64 * column_concrete = checkAndGetColumn(column.get()); + + if (!column_concrete) + throw Exception("Illegal column " + column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); + + const typename ColumnVector::Container & data = column_concrete->getData(); + auto result_column = ColumnString::create(); + + for (size_t i = 0; i < input_rows_count; ++i) + { + StringRef res_str = implCached(data[i]); + result_column->insertData(res_str.data, res_str.size); + } + + block.getByPosition(result).column = std::move(result_column); + } + +private: + std::mutex mutex; + Arena arena; + using Map = HashMap; + Map map; + std::unordered_map dwarfs; + + StringRef impl(uintptr_t addr) + { + const SymbolIndex & symbol_index = SymbolIndex::instance(); + + if (auto object = symbol_index.findObject(reinterpret_cast(addr))) + { + auto dwarf_it = dwarfs.try_emplace(object->name, *object->elf).first; + if 
(!std::filesystem::exists(object->name)) + return {}; + + Dwarf::LocationInfo location; + if (dwarf_it->second.findAddress(addr - uintptr_t(object->address_begin), location, Dwarf::LocationInfoMode::FAST)) + { + const char * arena_begin = nullptr; + WriteBufferFromArena out(arena, arena_begin); + + writeString(location.file.toString(), out); + writeChar(':', out); + writeIntText(location.line, out); + + StringRef out_str = out.finish(); + out_str.data = arena.insert(out_str.data, out_str.size); + return out_str; + } + else + { + return object->name; + } + } + else + return {}; + } + + StringRef implCached(uintptr_t addr) + { + Map::iterator it; + bool inserted; + std::lock_guard lock(mutex); + map.emplace(addr, it, inserted); + if (inserted) + it->getSecond() = impl(addr); + return it->getSecond(); + } +}; + +void registerFunctionAddressToLine(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/dbms/src/Functions/registerFunctionsIntrospection.cpp b/dbms/src/Functions/registerFunctionsIntrospection.cpp index 0797d21c36..79cd76f4ad 100644 --- a/dbms/src/Functions/registerFunctionsIntrospection.cpp +++ b/dbms/src/Functions/registerFunctionsIntrospection.cpp @@ -5,11 +5,13 @@ class FunctionFactory; void registerFunctionSymbolizeAddress(FunctionFactory & factory); void registerFunctionDemangle(FunctionFactory & factory); +void registerFunctionAddressToLine(FunctionFactory & factory); void registerFunctionsIntrospection(FunctionFactory & factory) { registerFunctionSymbolizeAddress(factory); registerFunctionDemangle(factory); + registerFunctionAddressToLine(factory); } } From efbbb149727ead3efe31866d311108438e4747d9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 02:49:00 +0300 Subject: [PATCH 13/21] Renamed function symbolizeAddress to addressToSymbol --- .../{symbolizeAddress.cpp => addressToSymbol.cpp} | 10 +++++----- dbms/src/Functions/registerFunctionsIntrospection.cpp | 4 ++-- 2 files changed, 7 insertions(+), 7 
deletions(-) rename dbms/src/Functions/{symbolizeAddress.cpp => addressToSymbol.cpp} (89%) diff --git a/dbms/src/Functions/symbolizeAddress.cpp b/dbms/src/Functions/addressToSymbol.cpp similarity index 89% rename from dbms/src/Functions/symbolizeAddress.cpp rename to dbms/src/Functions/addressToSymbol.cpp index 454fc94b7b..915327c6c1 100644 --- a/dbms/src/Functions/symbolizeAddress.cpp +++ b/dbms/src/Functions/addressToSymbol.cpp @@ -18,13 +18,13 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } -class FunctionSymbolizeAddress : public IFunction +class FunctionAddressToSymbol : public IFunction { public: - static constexpr auto name = "symbolizeAddress"; + static constexpr auto name = "addressToSymbol"; static FunctionPtr create(const Context &) { - return std::make_shared(); + return std::make_shared(); } String getName() const override @@ -82,9 +82,9 @@ public: } }; -void registerFunctionSymbolizeAddress(FunctionFactory & factory) +void registerFunctionAddressToSymbol(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/dbms/src/Functions/registerFunctionsIntrospection.cpp b/dbms/src/Functions/registerFunctionsIntrospection.cpp index 79cd76f4ad..448400b37a 100644 --- a/dbms/src/Functions/registerFunctionsIntrospection.cpp +++ b/dbms/src/Functions/registerFunctionsIntrospection.cpp @@ -3,13 +3,13 @@ namespace DB class FunctionFactory; -void registerFunctionSymbolizeAddress(FunctionFactory & factory); +void registerFunctionAddressToSymbol(FunctionFactory & factory); void registerFunctionDemangle(FunctionFactory & factory); void registerFunctionAddressToLine(FunctionFactory & factory); void registerFunctionsIntrospection(FunctionFactory & factory) { - registerFunctionSymbolizeAddress(factory); + registerFunctionAddressToSymbol(factory); registerFunctionDemangle(factory); registerFunctionAddressToLine(factory); } From 20ae0ee80eb78f0696ea4015bab34b0323e194c9 Mon Sep 17 00:00:00 2001 From: 
Alexey Milovidov Date: Tue, 30 Jul 2019 02:54:49 +0300 Subject: [PATCH 14/21] Added a flag to disable introspection functions --- dbms/src/Core/Settings.h | 1 + dbms/src/Functions/addressToLine.cpp | 7 ++++++- dbms/src/Functions/addressToSymbol.cpp | 6 +++++- dbms/src/Functions/demange.cpp | 7 ++++++- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 60f2599c73..0af0bf02d2 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -334,6 +334,7 @@ struct Settings : public SettingsCollection \ M(SettingBool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.") \ M(SettingBool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.") \ + M(SettingBool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.") \ \ M(SettingUInt64, max_partitions_per_insert_block, 100, "Limit maximum number of partitions in single INSERTed block. Zero means unlimited. Throw exception if the block contains too many partitions. 
This setting is a safety threshold, because using large number of partitions is a common misconception.") \ M(SettingBool, check_query_single_value_result, true, "Return check query result as single 1/0 value") \ diff --git a/dbms/src/Functions/addressToLine.cpp b/dbms/src/Functions/addressToLine.cpp index 0e3c4fe65a..7f7bd609de 100644 --- a/dbms/src/Functions/addressToLine.cpp +++ b/dbms/src/Functions/addressToLine.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -25,14 +26,18 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int FUNCTION_NOT_ALLOWED; } class FunctionAddressToLine : public IFunction { public: static constexpr auto name = "addressToLine"; - static FunctionPtr create(const Context &) + static FunctionPtr create(const Context & context) { + if (!context.getSettingsRef().allow_introspection_functions) + throw Exception("Introspection functions are disabled, because setting 'allow_introspection_functions' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); + return std::make_shared(); } diff --git a/dbms/src/Functions/addressToSymbol.cpp b/dbms/src/Functions/addressToSymbol.cpp index 915327c6c1..ceb641e457 100644 --- a/dbms/src/Functions/addressToSymbol.cpp +++ b/dbms/src/Functions/addressToSymbol.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -16,14 +17,17 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int FUNCTION_NOT_ALLOWED; } class FunctionAddressToSymbol : public IFunction { public: static constexpr auto name = "addressToSymbol"; - static FunctionPtr create(const Context &) + static FunctionPtr create(const Context & context) { + if (!context.getSettingsRef().allow_introspection_functions) + throw Exception("Introspection functions are disabled, because setting 
'allow_introspection_functions' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); return std::make_shared(); } diff --git a/dbms/src/Functions/demange.cpp b/dbms/src/Functions/demange.cpp index aea14f5bc7..a94b99f62e 100644 --- a/dbms/src/Functions/demange.cpp +++ b/dbms/src/Functions/demange.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -16,14 +17,18 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int FUNCTION_NOT_ALLOWED; } class FunctionDemangle : public IFunction { public: static constexpr auto name = "demangle"; - static FunctionPtr create(const Context &) + static FunctionPtr create(const Context & context) { + if (!context.getSettingsRef().allow_introspection_functions) + throw Exception("Introspection functions are disabled, because setting 'allow_introspection_functions' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); + return std::make_shared(); } From fec6ede519e1e2d90bfaa1d1cfde988a4918da23 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 03:15:01 +0300 Subject: [PATCH 15/21] Fixed glibc-compatibility --- libs/libglibc-compatibility/musl/utimensat.c | 38 ++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 libs/libglibc-compatibility/musl/utimensat.c diff --git a/libs/libglibc-compatibility/musl/utimensat.c b/libs/libglibc-compatibility/musl/utimensat.c new file mode 100644 index 0000000000..dce0a3c270 --- /dev/null +++ b/libs/libglibc-compatibility/musl/utimensat.c @@ -0,0 +1,38 @@ +#include +#include +#include +#include +#include "syscall.h" +#include + +int utimensat(int fd, const char *path, const struct timespec times[2], int flags) +{ + int r = __syscall(SYS_utimensat, fd, path, times, flags); +#ifdef SYS_futimesat + if (r != -ENOSYS || flags) return __syscall_ret(r); + struct timeval *tv = 0, tmp[2]; + if (times) { + int i; + tv = tmp; + for (i=0; i<2; i++) { + 
if (times[i].tv_nsec >= 1000000000ULL) { + if (times[i].tv_nsec == UTIME_NOW && + times[1-i].tv_nsec == UTIME_NOW) { + tv = 0; + break; + } + if (times[i].tv_nsec == UTIME_OMIT) + return __syscall_ret(-ENOSYS); + return __syscall_ret(-EINVAL); + } + tmp[i].tv_sec = times[i].tv_sec; + tmp[i].tv_usec = times[i].tv_nsec / 1000; + } + } + + r = __syscall(SYS_futimesat, fd, path, tv); + if (r != -ENOSYS || fd != AT_FDCWD) return __syscall_ret(r); + r = __syscall(SYS_utimes, path, tv); +#endif + return __syscall_ret(r); +} From c0118bda75fcca38bb6d03b7b09af8342e3c4dd0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 18:19:57 +0300 Subject: [PATCH 16/21] Fixed test --- dbms/tests/queries/0_stateless/00974_query_profiler.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/tests/queries/0_stateless/00974_query_profiler.sql b/dbms/tests/queries/0_stateless/00974_query_profiler.sql index b3d70bc6ac..da88e904f3 100644 --- a/dbms/tests/queries/0_stateless/00974_query_profiler.sql +++ b/dbms/tests/queries/0_stateless/00974_query_profiler.sql @@ -3,7 +3,7 @@ SET log_queries = 1; SELECT sleep(0.5), ignore('test real time query profiler'); SET log_queries = 0; SYSTEM FLUSH LOGS; -WITH symbolizeAddress(arrayJoin(trace)) AS symbol SELECT count() > 0 FROM system.trace_log t WHERE event_date >= yesterday() AND query_id = (SELECT query_id FROM system.query_log WHERE event_date >= yesterday() AND query LIKE '%test real time query profiler%' ORDER BY event_time DESC LIMIT 1) AND symbol LIKE '%FunctionSleep%'; +WITH addressToSymbol(arrayJoin(trace)) AS symbol SELECT count() > 0 FROM system.trace_log t WHERE event_date >= yesterday() AND query_id = (SELECT query_id FROM system.query_log WHERE event_date >= yesterday() AND query LIKE '%test real time query profiler%' ORDER BY event_time DESC LIMIT 1) AND symbol LIKE '%FunctionSleep%'; SET query_profiler_real_time_period_ns = 0; SET query_profiler_cpu_time_period_ns = 100000000; @@ -11,4 +11,4 
@@ SET log_queries = 1; SELECT count(), ignore('test cpu time query profiler') FROM numbers(1000000000); SET log_queries = 0; SYSTEM FLUSH LOGS; -WITH symbolizeAddress(arrayJoin(trace)) AS symbol SELECT count() > 0 FROM system.trace_log t WHERE event_date >= yesterday() AND query_id = (SELECT query_id FROM system.query_log WHERE event_date >= yesterday() AND query LIKE '%test cpu time query profiler%' ORDER BY event_time DESC LIMIT 1) AND symbol LIKE '%Numbers%'; +WITH addressToSymbol(arrayJoin(trace)) AS symbol SELECT count() > 0 FROM system.trace_log t WHERE event_date >= yesterday() AND query_id = (SELECT query_id FROM system.query_log WHERE event_date >= yesterday() AND query LIKE '%test cpu time query profiler%' ORDER BY event_time DESC LIMIT 1) AND symbol LIKE '%Numbers%'; From 1d289b5b4986f6f728e9e75c51e29483786eb61b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 18:22:41 +0300 Subject: [PATCH 17/21] Fixed "splitted" build --- dbms/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 99db33de3f..3a184a8297 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -162,6 +162,10 @@ endif () if (USE_UNWIND) target_compile_definitions (clickhouse_common_io PRIVATE USE_UNWIND=1) target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${UNWIND_INCLUDE_DIR}) + + if (NOT USE_INTERNAL_UNWIND_LIBRARY_FOR_EXCEPTION_HANDLING) + target_link_libraries (common PRIVATE ${UNWIND_LIBRARY}) + endif () endif () add_subdirectory(src/Common/ZooKeeper) From 72e0fbd8615134747e6860b0952038d7cf5018e2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 19:12:53 +0300 Subject: [PATCH 18/21] Added support for splitted debug info; advancements --- dbms/src/Common/SymbolIndex.cpp | 29 +++++++++++++++++++------- dbms/src/Common/tests/symbol_index.cpp | 10 ++++++--- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/dbms/src/Common/SymbolIndex.cpp 
b/dbms/src/Common/SymbolIndex.cpp index ff04ea35ea..25edb2350f 100644 --- a/dbms/src/Common/SymbolIndex.cpp +++ b/dbms/src/Common/SymbolIndex.cpp @@ -16,7 +16,10 @@ namespace { /// Notes: "PHDR" is "Program Headers". -/// To look at program headers, you can run: objdump -p ./clickhouse-server +/// To look at program headers, run: +/// readelf -l ./clickhouse-server +/// To look at section headers, run: +/// readelf -S ./clickhouse-server /// Also look at: https://wiki.osdev.org/ELF /// Also look at: man elf /// http://www.linker-aliens.org/blogs/ali/entry/inside_elf_symbol_tables/ @@ -25,11 +28,14 @@ namespace /// Based on the code of musl-libc and the answer of Kanalpiroge on /// https://stackoverflow.com/questions/15779185/list-all-the-functions-symbols-on-the-fly-in-c-code-on-a-linux-architecture +/// It does not extract all the symbols (but only public - exported and used for dynamic linking), +/// but will work if we cannot find or parse ELF files. void collectSymbolsFromProgramHeaders(dl_phdr_info * info, std::vector & symbols) { /* Iterate over all headers of the current shared lib - * (first call is for the executable itself) */ + * (first call is for the executable itself) + */ for (size_t header_index = 0; header_index < info->dlpi_phnum; ++header_index) { /* Further processing is only needed if the dynamic section is reached @@ -223,11 +229,16 @@ void collectSymbolsFromELF(dl_phdr_info * info, object_name = "/proc/self/exe"; std::error_code ec; - object_name = std::filesystem::canonical(object_name, ec); + std::filesystem::path canonical_path = std::filesystem::canonical(object_name, ec); if (ec) return; + /// Debug info and symbol table sections may be splitted to separate binary. + std::filesystem::path debug_info_path = std::filesystem::path("/usr/lib/debug") / canonical_path; + + object_name = std::filesystem::exists(debug_info_path) ? 
debug_info_path : canonical_path; + SymbolIndex::Object object; object.elf = std::make_unique(object_name); object.address_begin = reinterpret_cast(info->dlpi_addr); @@ -248,10 +259,6 @@ void collectSymbolsFromELF(dl_phdr_info * info, */ int collectSymbols(dl_phdr_info * info, size_t, void * data_ptr) { - /* ElfW is a macro that creates proper typenames for the used system architecture - * (e.g. on a 32 bit system, ElfW(Dyn*) becomes "Elf32_Dyn*") - */ - SymbolIndex::Data & data = *reinterpret_cast(data_ptr); collectSymbolsFromProgramHeaders(info, data.symbols); @@ -287,7 +294,15 @@ const T * find(const void * address, const std::vector & vec) void SymbolIndex::update() { dl_iterate_phdr(collectSymbols, &data.symbols); + + std::sort(data.objects.begin(), data.objects.end(), [](const Object & a, const Object & b) { return a.address_begin < b.address_begin; }); std::sort(data.symbols.begin(), data.symbols.end(), [](const Symbol & a, const Symbol & b) { return a.address_begin < b.address_begin; }); + + /// We found symbols both from loaded program headers and from ELF symbol tables. 
+ data.symbols.erase(std::unique(data.symbols.begin(), data.symbols.end(), [](const Symbol & a, const Symbol & b) + { + return a.address_begin == b.address_begin && a.address_end == b.address_end; + }), data.symbols.end()); } const SymbolIndex::Symbol * SymbolIndex::findSymbol(const void * address) const diff --git a/dbms/src/Common/tests/symbol_index.cpp b/dbms/src/Common/tests/symbol_index.cpp index c6a22b1266..3e2e73f3c9 100644 --- a/dbms/src/Common/tests/symbol_index.cpp +++ b/dbms/src/Common/tests/symbol_index.cpp @@ -1,12 +1,16 @@ #include #include #include +#include #include #include #include -void f() {} +NO_INLINE const void * getAddress() +{ + return __builtin_return_address(0); +} using namespace DB; @@ -37,8 +41,8 @@ int main(int argc, char ** argv) else std::cerr << "dladdr: Not found\n"; - Elf elf("/proc/self/exe"); - Dwarf dwarf(elf); + auto object = symbol_index.findObject(getAddress()); + Dwarf dwarf(*object->elf); Dwarf::LocationInfo location; if (dwarf.findAddress(uintptr_t(address), location, Dwarf::LocationInfoMode::FAST)) From a9b079c7bac82159674c0705731069cfce9fd81c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 19:18:06 +0300 Subject: [PATCH 19/21] Minor modifications --- dbms/src/Common/tests/symbol_index.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbms/src/Common/tests/symbol_index.cpp b/dbms/src/Common/tests/symbol_index.cpp index 3e2e73f3c9..6c0d303fe3 100644 --- a/dbms/src/Common/tests/symbol_index.cpp +++ b/dbms/src/Common/tests/symbol_index.cpp @@ -26,6 +26,7 @@ int main(int argc, char ** argv) for (const auto & elem : symbol_index.symbols()) std::cout << elem.name << ": " << elem.address_begin << " ... 
" << elem.address_end << "\n"; + std::cout << "\n"; const void * address = reinterpret_cast(std::stoull(argv[1], nullptr, 16)); @@ -50,6 +51,7 @@ int main(int argc, char ** argv) else std::cerr << "Dwarf: Not found\n"; + std::cerr << "\n"; std::cerr << StackTrace().toString() << "\n"; return 0; From 8292f3dbc963b48567247e14ccfd26b8ffb24f6c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 21:01:59 +0300 Subject: [PATCH 20/21] Fixed test --- dbms/tests/queries/0_stateless/00974_query_profiler.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbms/tests/queries/0_stateless/00974_query_profiler.sql b/dbms/tests/queries/0_stateless/00974_query_profiler.sql index da88e904f3..d77e656478 100644 --- a/dbms/tests/queries/0_stateless/00974_query_profiler.sql +++ b/dbms/tests/queries/0_stateless/00974_query_profiler.sql @@ -1,3 +1,5 @@ +SET allow_introspection_functions = 1; + SET query_profiler_real_time_period_ns = 100000000; SET log_queries = 1; SELECT sleep(0.5), ignore('test real time query profiler'); From d95e0e66c6f4c87a1a76e6855884d1bd39aeb24f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jul 2019 21:03:12 +0300 Subject: [PATCH 21/21] Fixed "splitted" build --- dbms/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 3a184a8297..21bfc5698f 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -164,7 +164,7 @@ if (USE_UNWIND) target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${UNWIND_INCLUDE_DIR}) if (NOT USE_INTERNAL_UNWIND_LIBRARY_FOR_EXCEPTION_HANDLING) - target_link_libraries (common PRIVATE ${UNWIND_LIBRARY}) + target_link_libraries (clickhouse_common_io PRIVATE ${UNWIND_LIBRARY}) endif () endif ()