// Copyright 2019 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_CCTEST_DISASM_REGEX_HELPER_H_ #define V8_CCTEST_DISASM_REGEX_HELPER_H_ #include <iostream> #include <map> #include <regex> // NOLINT(build/c++11) #include <vector> #include "src/base/logging.h" #include "src/base/macros.h" namespace v8 { namespace internal { // This class provides methods for regular expression matching with an extra // feature of user defined named capture groups which are alive across // regex search calls. // // The main use case for the class is to test multiple-line assembly // output with an ability to express dataflow or dependencies by allowing single // definition / multiple use symbols. When processing output lines and trying to // match them against the set of patterns a user can define a named group - a // symbol - and a regex for matching it. If the regex with the definitions is // matched then whenever this symbol appears again (no redefinitions though) in // the following patterns the parser will replace the symbol reference in the // pattern by an actual literal value matched during processing symbol // definition. This effectively checks that all of the output lines have // the same literal for the described symbol. To track the symbols this class // implements a simple single-definition symbol table. // // Example: Lets consider a case when we want to test that the assembly // output consists of two instructions - a load and a store; we also want // to check that the loaded value is used as store value for the store, // like here: // // ldr x3, [x4] // str x3, [x5] // // Using special syntax for symbol definitions and uses one could write the // following regex making sure that the load register is used by the store: // // 'ldr <<NamedReg:x[0-9]+>>, [x[0-9]+]' // 'str <<NamedReg>>, [x[0-9]+]' // // See 'ProcessPattern' for more details. class RegexParser { public: RegexParser() // Regex to parse symbol references: definitions or uses. // <<SymbolName[:'def regex']>> : symbol_ref_regex_("<<([a-zA-Z_][a-zA-Z0-9_]*)(?::(.*?))?>>") {} // Status codes used for return values and error diagnostics. enum class Status { kSuccess = 0, kNotMatched, kWrongPattern, kDefNotFound, kRedefinition, }; // This class holds info on a symbol definition. class SymbolInfo { public: explicit SymbolInfo(const std::string& matched_value) : matched_value_(matched_value) {} // Returns an actual matched value for the symbol. const std::string& matched_value() const { return matched_value_; } private: std::string matched_value_; }; // This class holds temporary info on a symbol while processing an input line. class SymbolVectorElem { public: SymbolVectorElem(bool is_def, const std::string& symbol_name) : is_def_(is_def), symbol_name_(symbol_name) {} bool is_def() const { return is_def_; } const std::string& symbol_name() const { return symbol_name_; } private: bool is_def_; std::string symbol_name_; }; using SymbolMap = std::map<std::string, SymbolInfo>; using MatchVector = std::vector<SymbolVectorElem>; // Tries to match (actually search, similar to std::regex_serach) the line // against the pattern (possibly containing symbols references) and if // matched commits symbols definitions from the pattern to the symbol table. // // Returns: status of the matching attempt. // // Important: the format of pattern regexs is based on std::ECMAScript syntax // (http://www.cplusplus.com/reference/regex/ECMAScript/) with a few extra // restrictions: // * no backreference (or submatch) groups // - when a group (e.g. "(a|b)+") is needed use a passive group // (e.g. "(?:a|b)+"). // * special syntax for symbol definitions: <<Name:regex>> // - 'Name' must be c-ctyle variable name ([a-zA-Z_][a-zA-Z0-9_]*). // - 'regex' - is a regex for the actual literal expected in the symbol // definition line. It must not contain any symbol references. // * special syntax for symbol uses <<Name>> // // Semantical restrictions on symbols references: // * symbols mustn't be referenced before they are defined. // - a pattern R1 which uses symbol 'A' mustn't be processed if a pattern // R2 with the symbol 'A' definition hasn't been yet matched (R1!=R2). // - A pattern mustn't define a symbol and use it inside the same regex. // * symbols mustn't be redefined. // - if a line has been matched against a pattern R1 with symbol 'A' // then other patterns mustn't define symbol 'A'. // * symbols defininitions are only committed and registered if the whole // pattern is successfully matched. // // Notes: // * A pattern may contain uses of the same or different symbols and // definitions of different symbols however if a symbol is defined in the // pattern it can't be used in the same pattern. // // Pattern example: "<<A:[0-9]+>> <<B>>, <<B> <<C:[a-z]+>>" (assuming 'B' is // defined and matched). Status ProcessPattern(const std::string& line, const std::string& pattern) { // Processed pattern which is going to be used for std::regex_search; symbol // references are replaced accordingly to the reference type - def or use. std::string final_pattern; // A vector of records for symbols references in the pattern. The format is // {is_definition, symbol_name}. MatchVector symbols_refs; Status status = ParseSymbolsInPattern(pattern, &final_pattern, &symbols_refs); if (status != Status::kSuccess) { return status; } std::smatch match; if (!std::regex_search(line, match, std::regex(final_pattern))) { return Status::kNotMatched; } // This checks that no backreference groups were used in the pattern except // for those added by ParseSymbolsInPattern. if (symbols_refs.size() != (match.size() - 1)) { return Status::kWrongPattern; } status = CheckSymbolsMatchedValues(symbols_refs, match); if (status != Status::kSuccess) { return status; } CommitSymbolsDefinitions(symbols_refs, match); return Status::kSuccess; } // Returns whether a symbol is defined in the symbol name. bool IsSymbolDefined(const std::string& symbol_name) const { auto symbol_map_iter = map_.find(symbol_name); return symbol_map_iter != std::end(map_); } // Returns the matched value for a symbol. std::string GetSymbolMatchedValue(const std::string& symbol_name) const { DCHECK(IsSymbolDefined(symbol_name)); return map_.find(symbol_name)->second.matched_value(); } // Prints the symbol table. void PrintSymbols(std::ostream& os) const { os << "Printing symbol table..." << std::endl; for (const auto& t : map_) { const std::string& sym_name = t.first; const SymbolInfo& sym_info = t.second; os << "<<" << sym_name << ">>: \"" << sym_info.matched_value() << "\"" << std::endl; } } protected: // Fixed layout for the symbol reference match. enum SymbolMatchIndex { kFullSubmatch = 0, kName = 1, kDefRegex = 2, kSize = kDefRegex + 1, }; // Processes a symbol reference: for definitions it adds the symbol regex, for // uses it adds actual literal from a previously matched definition. Also // fills the symbol references vector. Status ProcessSymbol(const std::smatch& match, MatchVector* symbols_refs, std::string* new_pattern) const { bool is_def = match[SymbolMatchIndex::kDefRegex].length() != 0; const std::string& symbol_name = match[SymbolMatchIndex::kName]; if (is_def) { // Make sure the symbol isn't already defined. auto symbol_iter = std::find_if(symbols_refs->begin(), symbols_refs->end(), [symbol_name](const SymbolVectorElem& ref) -> bool { return ref.symbol_name() == symbol_name; }); if (symbol_iter != std::end(*symbols_refs)) { return Status::kRedefinition; } symbols_refs->emplace_back(true, symbol_name); new_pattern->append("("); new_pattern->append(match[SymbolMatchIndex::kDefRegex]); new_pattern->append(")"); } else { auto symbol_map_iter = map_.find(symbol_name); if (symbol_map_iter == std::end(map_)) { return Status::kDefNotFound; } const SymbolInfo& sym_info = symbol_map_iter->second; new_pattern->append("("); new_pattern->append(sym_info.matched_value()); new_pattern->append(")"); symbols_refs->emplace_back(false, symbol_name); } return Status::kSuccess; } // Parses the input pattern regex, processes symbols defs and uses inside // it, fills a raw pattern used for std::regex_search. Status ParseSymbolsInPattern(const std::string& pattern, std::string* raw_pattern, MatchVector* symbols_refs) const { std::string::const_iterator low = pattern.cbegin(); std::string::const_iterator high = pattern.cend(); std::smatch match; while (low != high) { // Search for a symbol reference. if (!std::regex_search(low, high, match, symbol_ref_regex_)) { raw_pattern->append(low, high); break; } if (match.size() != SymbolMatchIndex::kSize) { return Status::kWrongPattern; } raw_pattern->append(match.prefix()); Status status = ProcessSymbol(match, symbols_refs, raw_pattern); if (status != Status::kSuccess) { return status; } low = match[SymbolMatchIndex::kFullSubmatch].second; } return Status::kSuccess; } // Checks that there are no symbol redefinitions and the symbols uses matched // literal values are equal to corresponding matched definitions. Status CheckSymbolsMatchedValues(const MatchVector& symbols_refs, const std::smatch& match) const { // There is a one-to-one correspondence between matched subexpressions and // symbols refences in the vector (by construction). for (size_t vec_pos = 0, size = symbols_refs.size(); vec_pos < size; vec_pos++) { auto elem = symbols_refs[vec_pos]; auto map_iter = map_.find(elem.symbol_name()); if (elem.is_def()) { if (map_iter != std::end(map_)) { return Status::kRedefinition; } } else { DCHECK(map_iter != std::end(map_)); // We replaced use with matched definition value literal. DCHECK_EQ(map_iter->second.matched_value().compare(match[vec_pos + 1]), 0); } } return Status::kSuccess; } // Commits symbols definitions and their matched values to the symbol table. void CommitSymbolsDefinitions(const MatchVector& groups_vector, const std::smatch& match) { for (size_t vec_pos = 0, size = groups_vector.size(); vec_pos < size; vec_pos++) { size_t match_pos = vec_pos + 1; auto elem = groups_vector[vec_pos]; if (elem.is_def()) { auto emplace_res = map_.emplace(elem.symbol_name(), SymbolInfo(match[match_pos])); USE(emplace_res); // Silence warning about unused variable. DCHECK(emplace_res.second == true); } } } const std::regex symbol_ref_regex_; SymbolMap map_; }; bool CheckDisassemblyRegexPatterns( const char* function_name, const std::vector<std::string>& patterns_array); } // namespace internal } // namespace v8 #endif // V8_CCTEST_DISASM_REGEX_HELPER_H_