master
  1//===-- sanitizer_symbolizer_libcdep.cpp ----------------------------------===//
  2//
  3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4// See https://llvm.org/LICENSE.txt for license information.
  5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6//
  7//===----------------------------------------------------------------------===//
  8//
  9// This file is shared between AddressSanitizer and ThreadSanitizer
 10// run-time libraries.
 11//===----------------------------------------------------------------------===//
 12
 13#include "sanitizer_allocator_internal.h"
 14#include "sanitizer_internal_defs.h"
 15#include "sanitizer_platform.h"
 16#include "sanitizer_symbolizer_internal.h"
 17
 18namespace __sanitizer {
 19
 20Symbolizer *Symbolizer::GetOrInit() {
 21  SpinMutexLock l(&init_mu_);
 22  if (symbolizer_)
 23    return symbolizer_;
 24  symbolizer_ = PlatformInit();
 25  CHECK(symbolizer_);
 26  return symbolizer_;
 27}
 28
 29// See sanitizer_symbolizer_markup.cpp.
 30#if !SANITIZER_SYMBOLIZER_MARKUP
 31
 32const char *ExtractToken(const char *str, const char *delims, char **result) {
 33  uptr prefix_len = internal_strcspn(str, delims);
 34  *result = (char *)InternalAlloc(prefix_len + 1);
 35  internal_memcpy(*result, str, prefix_len);
 36  (*result)[prefix_len] = '\0';
 37  const char *prefix_end = str + prefix_len;
 38  if (*prefix_end != '\0')
 39    prefix_end++;
 40  return prefix_end;
 41}
 42
 43const char *ExtractInt(const char *str, const char *delims, int *result) {
 44  char *buff = nullptr;
 45  const char *ret = ExtractToken(str, delims, &buff);
 46  if (buff) {
 47    *result = (int)internal_atoll(buff);
 48  }
 49  InternalFree(buff);
 50  return ret;
 51}
 52
 53const char *ExtractUptr(const char *str, const char *delims, uptr *result) {
 54  char *buff = nullptr;
 55  const char *ret = ExtractToken(str, delims, &buff);
 56  if (buff) {
 57    *result = (uptr)internal_atoll(buff);
 58  }
 59  InternalFree(buff);
 60  return ret;
 61}
 62
 63const char *ExtractSptr(const char *str, const char *delims, sptr *result) {
 64  char *buff = nullptr;
 65  const char *ret = ExtractToken(str, delims, &buff);
 66  if (buff) {
 67    *result = (sptr)internal_atoll(buff);
 68  }
 69  InternalFree(buff);
 70  return ret;
 71}
 72
 73const char *ExtractTokenUpToDelimiter(const char *str, const char *delimiter,
 74                                      char **result) {
 75  const char *found_delimiter = internal_strstr(str, delimiter);
 76  uptr prefix_len =
 77      found_delimiter ? found_delimiter - str : internal_strlen(str);
 78  *result = (char *)InternalAlloc(prefix_len + 1);
 79  internal_memcpy(*result, str, prefix_len);
 80  (*result)[prefix_len] = '\0';
 81  const char *prefix_end = str + prefix_len;
 82  if (*prefix_end != '\0')
 83    prefix_end += internal_strlen(delimiter);
 84  return prefix_end;
 85}
 86
 87SymbolizedStack *Symbolizer::SymbolizePC(uptr addr) {
 88  Lock l(&mu_);
 89  SymbolizedStack *res = SymbolizedStack::New(addr);
 90  auto *mod = FindModuleForAddress(addr);
 91  if (!mod)
 92    return res;
 93  // Always fill data about module name and offset.
 94  res->info.FillModuleInfo(*mod);
 95  for (auto &tool : tools_) {
 96    SymbolizerScope sym_scope(this);
 97    if (tool.SymbolizePC(addr, res)) {
 98      return res;
 99    }
100  }
101  return res;
102}
103
104bool Symbolizer::SymbolizeData(uptr addr, DataInfo *info) {
105  Lock l(&mu_);
106  const char *module_name = nullptr;
107  uptr module_offset;
108  ModuleArch arch;
109  if (!FindModuleNameAndOffsetForAddress(addr, &module_name, &module_offset,
110                                         &arch))
111    return false;
112  info->Clear();
113  info->module = internal_strdup(module_name);
114  info->module_offset = module_offset;
115  info->module_arch = arch;
116  for (auto &tool : tools_) {
117    SymbolizerScope sym_scope(this);
118    if (tool.SymbolizeData(addr, info)) {
119      return true;
120    }
121  }
122  return false;
123}
124
125bool Symbolizer::SymbolizeFrame(uptr addr, FrameInfo *info) {
126  Lock l(&mu_);
127  const char *module_name = nullptr;
128  if (!FindModuleNameAndOffsetForAddress(
129          addr, &module_name, &info->module_offset, &info->module_arch))
130    return false;
131  info->module = internal_strdup(module_name);
132  for (auto &tool : tools_) {
133    SymbolizerScope sym_scope(this);
134    if (tool.SymbolizeFrame(addr, info)) {
135      return true;
136    }
137  }
138  return false;
139}
140
141bool Symbolizer::GetModuleNameAndOffsetForPC(uptr pc, const char **module_name,
142                                             uptr *module_address) {
143  Lock l(&mu_);
144  const char *internal_module_name = nullptr;
145  ModuleArch arch;
146  if (!FindModuleNameAndOffsetForAddress(pc, &internal_module_name,
147                                         module_address, &arch))
148    return false;
149
150  if (module_name)
151    *module_name = module_names_.GetOwnedCopy(internal_module_name);
152  return true;
153}
154
155void Symbolizer::Flush() {
156  Lock l(&mu_);
157  for (auto &tool : tools_) {
158    SymbolizerScope sym_scope(this);
159    tool.Flush();
160  }
161}
162
163const char *Symbolizer::Demangle(const char *name) {
164  CHECK(name);
165  Lock l(&mu_);
166  for (auto &tool : tools_) {
167    SymbolizerScope sym_scope(this);
168    if (const char *demangled = tool.Demangle(name))
169      return demangled;
170  }
171  if (const char *demangled = PlatformDemangle(name))
172    return demangled;
173  return name;
174}
175
176bool Symbolizer::FindModuleNameAndOffsetForAddress(uptr address,
177                                                   const char **module_name,
178                                                   uptr *module_offset,
179                                                   ModuleArch *module_arch) {
180  const LoadedModule *module = FindModuleForAddress(address);
181  if (!module)
182    return false;
183  *module_name = module->full_name();
184  *module_offset = address - module->base_address();
185  *module_arch = module->arch();
186  return true;
187}
188
// Reloads both module lists: modules_ through init() and fallback_modules_
// through fallbackInit(). RAW_CHECKs that the primary list is not empty and
// then marks the cached lists as fresh.
void Symbolizer::RefreshModules() {
  modules_.init();
  fallback_modules_.fallbackInit();
  RAW_CHECK(modules_.size() > 0);
  modules_fresh_ = true;
}
195
196const ListOfModules &Symbolizer::GetRefreshedListOfModules() {
197  if (!modules_fresh_)
198    RefreshModules();
199
200  return modules_;
201}
202
203static const LoadedModule *SearchForModule(const ListOfModules &modules,
204                                           uptr address) {
205  for (uptr i = 0; i < modules.size(); i++) {
206    if (modules[i].containsAddress(address)) {
207      return &modules[i];
208    }
209  }
210  return nullptr;
211}
212
213const LoadedModule *Symbolizer::FindModuleForAddress(uptr address) {
214  bool modules_were_reloaded = false;
215  if (!modules_fresh_) {
216    RefreshModules();
217    modules_were_reloaded = true;
218  }
219  const LoadedModule *module = SearchForModule(modules_, address);
220  if (module)
221    return module;
222
223  // dlopen/dlclose interceptors invalidate the module list, but when
224  // interception is disabled, we need to retry if the lookup fails in
225  // case the module list changed.
226#  if !SANITIZER_INTERCEPT_DLOPEN_DLCLOSE
227  if (!modules_were_reloaded) {
228    RefreshModules();
229    module = SearchForModule(modules_, address);
230    if (module)
231      return module;
232  }
233#  endif
234
235  if (fallback_modules_.size()) {
236    module = SearchForModule(fallback_modules_, address);
237  }
238  return module;
239}
240
241// For now we assume the following protocol:
242// For each request of the form
243//   <module_name> <module_offset>
244// passed to STDIN, external symbolizer prints to STDOUT response:
245//   <function_name>
246//   <file_name>:<line_number>:<column_number>
247//   <function_name>
248//   <file_name>:<line_number>:<column_number>
249//   ...
250//   <empty line>
251class LLVMSymbolizerProcess final : public SymbolizerProcess {
252 public:
253  explicit LLVMSymbolizerProcess(const char *path)
254      : SymbolizerProcess(path, /*use_posix_spawn=*/SANITIZER_APPLE) {}
255
256 private:
257  bool ReachedEndOfOutput(const char *buffer, uptr length) const override {
258    // Empty line marks the end of llvm-symbolizer output.
259    return length >= 2 && buffer[length - 1] == '\n' &&
260           buffer[length - 2] == '\n';
261  }
262
263  // When adding a new architecture, don't forget to also update
264  // script/asan_symbolize.py and sanitizer_common.h.
265  void GetArgV(const char *path_to_binary,
266               const char *(&argv)[kArgVMax]) const override {
267#  if defined(__x86_64h__)
268    const char *const kSymbolizerArch = "--default-arch=x86_64h";
269#  elif defined(__x86_64__)
270    const char *const kSymbolizerArch = "--default-arch=x86_64";
271#  elif defined(__i386__)
272    const char *const kSymbolizerArch = "--default-arch=i386";
273#  elif SANITIZER_LOONGARCH64
274    const char *const kSymbolizerArch = "--default-arch=loongarch64";
275#  elif SANITIZER_RISCV64
276    const char *const kSymbolizerArch = "--default-arch=riscv64";
277#  elif defined(__aarch64__)
278    const char *const kSymbolizerArch = "--default-arch=arm64";
279#  elif defined(__arm__)
280    const char *const kSymbolizerArch = "--default-arch=arm";
281#  elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
282    const char *const kSymbolizerArch = "--default-arch=powerpc64";
283#  elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
284    const char *const kSymbolizerArch = "--default-arch=powerpc64le";
285#  elif defined(__s390x__)
286    const char *const kSymbolizerArch = "--default-arch=s390x";
287#  elif defined(__s390__)
288    const char *const kSymbolizerArch = "--default-arch=s390";
289#  else
290    const char *const kSymbolizerArch = "--default-arch=unknown";
291#  endif
292
293    const char *const demangle_flag =
294        common_flags()->demangle ? "--demangle" : "--no-demangle";
295    const char *const inline_flag =
296        common_flags()->symbolize_inline_frames ? "--inlines" : "--no-inlines";
297    int i = 0;
298    argv[i++] = path_to_binary;
299    argv[i++] = demangle_flag;
300    argv[i++] = inline_flag;
301    argv[i++] = kSymbolizerArch;
302    argv[i++] = nullptr;
303    CHECK_LE(i, kArgVMax);
304  }
305};
306
// Constructs the tool around a new LLVMSymbolizerProcess for the binary at
// |path|. The process object is placement-allocated from |allocator|.
LLVMSymbolizer::LLVMSymbolizer(const char *path, LowLevelAllocator *allocator)
    : symbolizer_process_(new(*allocator) LLVMSymbolizerProcess(path)) {}
309
310// Parse a <file>:<line>[:<column>] buffer. The file path may contain colons on
311// Windows, so extract tokens from the right hand side first. The column info is
312// also optional.
313static const char *ParseFileLineInfo(AddressInfo *info, const char *str) {
314  char *file_line_info = nullptr;
315  str = ExtractToken(str, "\n", &file_line_info);
316  CHECK(file_line_info);
317
318  if (uptr size = internal_strlen(file_line_info)) {
319    char *back = file_line_info + size - 1;
320    for (int i = 0; i < 2; ++i) {
321      while (back > file_line_info && IsDigit(*back)) --back;
322      if (*back != ':' || !IsDigit(back[1]))
323        break;
324      info->column = info->line;
325      info->line = internal_atoll(back + 1);
326      // Truncate the string at the colon to keep only filename.
327      *back = '\0';
328      --back;
329    }
330    ExtractToken(file_line_info, "", &info->file);
331  }
332
333  InternalFree(file_line_info);
334  return str;
335}
336
337// Parses one or more two-line strings in the following format:
338//   <function_name>
339//   <file_name>:<line_number>[:<column_number>]
340// Used by LLVMSymbolizer, Addr2LinePool and InternalSymbolizer, since all of
341// them use the same output format.
342void ParseSymbolizePCOutput(const char *str, SymbolizedStack *res) {
343  bool top_frame = true;
344  SymbolizedStack *last = res;
345  while (true) {
346    char *function_name = nullptr;
347    str = ExtractToken(str, "\n", &function_name);
348    CHECK(function_name);
349    if (function_name[0] == '\0') {
350      // There are no more frames.
351      InternalFree(function_name);
352      break;
353    }
354    SymbolizedStack *cur;
355    if (top_frame) {
356      cur = res;
357      top_frame = false;
358    } else {
359      cur = SymbolizedStack::New(res->info.address);
360      cur->info.FillModuleInfo(res->info.module, res->info.module_offset,
361                               res->info.module_arch);
362      last->next = cur;
363      last = cur;
364    }
365
366    AddressInfo *info = &cur->info;
367    info->function = function_name;
368    str = ParseFileLineInfo(info, str);
369
370    // Functions and filenames can be "??", in which case we write 0
371    // to address info to mark that names are unknown.
372    if (0 == internal_strcmp(info->function, "??")) {
373      InternalFree(info->function);
374      info->function = 0;
375    }
376    if (info->file && 0 == internal_strcmp(info->file, "??")) {
377      InternalFree(info->file);
378      info->file = 0;
379    }
380  }
381}
382
383// Parses a two- or three-line string in the following format:
384//   <symbol_name>
385//   <start_address> <size>
386//   <filename>:<column>
387// Used by LLVMSymbolizer and InternalSymbolizer. LLVMSymbolizer added support
388// for symbolizing the third line in D123538, but we support the older two-line
389// information as well.
390void ParseSymbolizeDataOutput(const char *str, DataInfo *info) {
391  str = ExtractToken(str, "\n", &info->name);
392  str = ExtractUptr(str, " ", &info->start);
393  str = ExtractUptr(str, "\n", &info->size);
394  // Note: If the third line isn't present, these calls will set info.{file,
395  // line} to empty strings.
396  str = ExtractToken(str, ":", &info->file);
397  str = ExtractUptr(str, "\n", &info->line);
398}
399
400void ParseSymbolizeFrameOutput(const char *str,
401                               InternalMmapVector<LocalInfo> *locals) {
402  if (internal_strncmp(str, "??", 2) == 0)
403    return;
404
405  while (*str) {
406    LocalInfo local;
407    str = ExtractToken(str, "\n", &local.function_name);
408    str = ExtractToken(str, "\n", &local.name);
409
410    AddressInfo addr;
411    str = ParseFileLineInfo(&addr, str);
412    local.decl_file = addr.file;
413    local.decl_line = addr.line;
414
415    local.has_frame_offset = internal_strncmp(str, "??", 2) != 0;
416    str = ExtractSptr(str, " ", &local.frame_offset);
417
418    local.has_size = internal_strncmp(str, "??", 2) != 0;
419    str = ExtractUptr(str, " ", &local.size);
420
421    local.has_tag_offset = internal_strncmp(str, "??", 2) != 0;
422    str = ExtractUptr(str, "\n", &local.tag_offset);
423
424    locals->push_back(local);
425  }
426}
427
428bool LLVMSymbolizer::SymbolizePC(uptr addr, SymbolizedStack *stack) {
429  AddressInfo *info = &stack->info;
430  const char *buf = FormatAndSendCommand(
431      "CODE", info->module, info->module_offset, info->module_arch);
432  if (!buf)
433    return false;
434  ParseSymbolizePCOutput(buf, stack);
435  return true;
436}
437
438bool LLVMSymbolizer::SymbolizeData(uptr addr, DataInfo *info) {
439  const char *buf = FormatAndSendCommand(
440      "DATA", info->module, info->module_offset, info->module_arch);
441  if (!buf)
442    return false;
443  ParseSymbolizeDataOutput(buf, info);
444  info->start += (addr - info->module_offset);  // Add the base address.
445  return true;
446}
447
448bool LLVMSymbolizer::SymbolizeFrame(uptr addr, FrameInfo *info) {
449  const char *buf = FormatAndSendCommand(
450      "FRAME", info->module, info->module_offset, info->module_arch);
451  if (!buf)
452    return false;
453  ParseSymbolizeFrameOutput(buf, &info->locals);
454  return true;
455}
456
457const char *LLVMSymbolizer::FormatAndSendCommand(const char *command_prefix,
458                                                 const char *module_name,
459                                                 uptr module_offset,
460                                                 ModuleArch arch) {
461  CHECK(module_name);
462  int size_needed = 0;
463  if (arch == kModuleArchUnknown)
464    size_needed = internal_snprintf(buffer_, kBufferSize, "%s \"%s\" 0x%zx\n",
465                                    command_prefix, module_name, module_offset);
466  else
467    size_needed = internal_snprintf(
468        buffer_, kBufferSize, "%s \"%s:%s\" 0x%zx\n", command_prefix,
469        module_name, ModuleArchToString(arch), module_offset);
470
471  if (size_needed >= static_cast<int>(kBufferSize)) {
472    Report("WARNING: Command buffer too small");
473    return nullptr;
474  }
475
476  return symbolizer_process_->SendCommand(buffer_);
477}
478
// Records the path of the symbolizer binary and the spawn mode. No process
// is started here: both descriptors start out as kInvalidFd and all
// restart/failure bookkeeping starts out cleared. |path| must be a non-null,
// non-empty string.
SymbolizerProcess::SymbolizerProcess(const char *path, bool use_posix_spawn)
    : path_(path),
      input_fd_(kInvalidFd),
      output_fd_(kInvalidFd),
      times_restarted_(0),
      failed_to_start_(false),
      reported_invalid_path_(false),
      use_posix_spawn_(use_posix_spawn) {
  CHECK(path_);
  CHECK_NE(path_[0], '\0');
}
490
491static bool IsSameModule(const char *path) {
492  if (const char *ProcessName = GetProcessName()) {
493    if (const char *SymbolizerName = StripModuleName(path)) {
494      return !internal_strcmp(ProcessName, SymbolizerName);
495    }
496  }
497  return false;
498}
499
500const char *SymbolizerProcess::SendCommand(const char *command) {
501  if (failed_to_start_)
502    return nullptr;
503  if (IsSameModule(path_)) {
504    Report("WARNING: Symbolizer was blocked from starting itself!\n");
505    failed_to_start_ = true;
506    return nullptr;
507  }
508  for (; times_restarted_ < kMaxTimesRestarted; times_restarted_++) {
509    // Start or restart symbolizer if we failed to send command to it.
510    if (const char *res = SendCommandImpl(command))
511      return res;
512    Restart();
513  }
514  if (!failed_to_start_) {
515    Report("WARNING: Failed to use and restart external symbolizer!\n");
516    failed_to_start_ = true;
517  }
518  return nullptr;
519}
520
521const char *SymbolizerProcess::SendCommandImpl(const char *command) {
522  if (input_fd_ == kInvalidFd || output_fd_ == kInvalidFd)
523    return nullptr;
524  if (!WriteToSymbolizer(command, internal_strlen(command)))
525    return nullptr;
526  if (!ReadFromSymbolizer())
527    return nullptr;
528  return buffer_.data();
529}
530
531bool SymbolizerProcess::Restart() {
532  if (input_fd_ != kInvalidFd)
533    CloseFile(input_fd_);
534  if (output_fd_ != kInvalidFd)
535    CloseFile(output_fd_);
536  return StartSymbolizerSubprocess();
537}
538
539bool SymbolizerProcess::ReadFromSymbolizer() {
540  buffer_.clear();
541  constexpr uptr max_length = 1024;
542  bool ret = true;
543  do {
544    uptr just_read = 0;
545    uptr size_before = buffer_.size();
546    buffer_.resize(size_before + max_length);
547    buffer_.resize(buffer_.capacity());
548    bool ret = ReadFromFile(input_fd_, &buffer_[size_before],
549                            buffer_.size() - size_before, &just_read);
550
551    if (!ret)
552      just_read = 0;
553
554    buffer_.resize(size_before + just_read);
555
556    // We can't read 0 bytes, as we don't expect external symbolizer to close
557    // its stdout.
558    if (just_read == 0) {
559      Report("WARNING: Can't read from symbolizer at fd %d\n", input_fd_);
560      ret = false;
561      break;
562    }
563  } while (!ReachedEndOfOutput(buffer_.data(), buffer_.size()));
564  buffer_.push_back('\0');
565  return ret;
566}
567
568bool SymbolizerProcess::WriteToSymbolizer(const char *buffer, uptr length) {
569  if (length == 0)
570    return true;
571  uptr write_len = 0;
572  bool success = WriteToFile(output_fd_, buffer, length, &write_len);
573  if (!success || write_len != length) {
574    Report("WARNING: Can't write to symbolizer at fd %d\n", output_fd_);
575    return false;
576  }
577  return true;
578}
579
580#endif  // !SANITIZER_SYMBOLIZER_MARKUP
581
582}  // namespace __sanitizer