Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion include/tsutil/Regex.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,42 @@ class RegexMatches
_MatchDataPtr _match_data;
};

/// @brief Wrapper for PCRE2 match context
class RegexMatchContext
{
friend class Regex;

public:
/** Construct a new RegexMatchContext object.
*/
RegexMatchContext();
~RegexMatchContext();

/// uses pcre2_match_context_copy to duplicate.
RegexMatchContext(RegexMatchContext const &orig);
RegexMatchContext &operator=(RegexMatchContext const &orig);

RegexMatchContext(RegexMatchContext &&) = default;
RegexMatchContext &operator=(RegexMatchContext &&) = default;

/** Limits the amount of backtracking that can take place.
*/
void setMatchLimit(uint32_t limit);

/** Limits how far an unanchored search can advance in the subject string.
*/
void setOffsetLimit(uint32_t limit);

private:
/// @internal This wraps a void* so to avoid requiring a pcre2 include.
struct _MatchContext;
struct _MatchContextPtr {
void *_ptr = nullptr;
};

_MatchContextPtr _match_context;
};

/// @brief Wrapper for PCRE2 regular expression.
class Regex
{
Expand Down Expand Up @@ -179,14 +215,16 @@ class Regex
* @param subject String to match against.
* @param matches Place to store the capture groups.
* @param flags Match flags (e.g., RE_NOTEMPTY).
* @param optional context Match context (set matching limits).
* @return @c The number of capture groups. < 0 if an error occurred. 0 if the number of Matches is too small.
*
* It is safe to call this method concurrently on the same instance of @a this.
*
* Each capture group takes 3 elements of @a ovector, therefore @a ovecsize must
* be a multiple of 3 and at least three times the number of desired capture groups.
*/
int exec(std::string_view subject, RegexMatches &matches, uint32_t flags) const;
int exec(std::string_view subject, RegexMatches &matches, uint32_t flags,
RegexMatchContext const *const matchContext = nullptr) const;

/// @return The number of capture groups in the compiled pattern.
int get_capture_count();
Expand Down
2 changes: 1 addition & 1 deletion plugins/regex_remap/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@

add_atsplugin(regex_remap regex_remap.cc)

target_link_libraries(regex_remap PRIVATE PCRE::PCRE libswoc::libswoc)
target_link_libraries(regex_remap PRIVATE libswoc::libswoc)

verify_remap_plugin(regex_remap)
157 changes: 71 additions & 86 deletions plugins/regex_remap/regex_remap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,14 @@
#include "tscore/ink_time.h"
#include "tscore/ink_inet.h"

#ifdef HAVE_PCRE_PCRE_H
#include <pcre/pcre.h>
#else
#include <pcre.h>
#endif
#include "tsutil/Regex.h"

static const char *PLUGIN_NAME = "regex_remap";

// Constants
static const int OVECCOUNT = 30; // We support $0 - $9 x2 ints, and this needs to be 1.5x that
static const int MAX_SUBS = 32; // No more than 32 substitution variables in the subst string
static const int MATCHCOUNT = 15; // We support $0 - $9 x2 ints, and this needs to be 1.5x that
static const int MAX_SUBS = 32; // No more than 32 substitution variables in the subst string
static const int32_t REGEX_MATCH_LIMIT = 1750; // POOMA - also dependent on actual stack size. Crashes with previous value of 2047

// Substitutions other than regex matches
enum ExtraSubstitutions {
Expand Down Expand Up @@ -117,13 +114,6 @@ class RemapRegex
Dbg(dbg_ctl, "Calling destructor");
TSfree(_rex_string);
TSfree(_subst);

if (_rex) {
pcre_free(_rex);
}
if (_extra) {
pcre_free(_extra);
}
}

bool initialize(const std::string &reg, const std::string &sub, const std::string &opt);
Expand All @@ -140,25 +130,26 @@ class RemapRegex
fprintf(stderr, "[%s]: Regex %d ( %s ): %.2f%%\n", now, ix, _rex_string, 100.0 * _hits / max);
}

int compile(const char *&error, int &erroffset);
// Returns '0' on success
int compile(std::string &error, int &erroffset);

// Perform the regular expression matching against a string.
// number of matches, or '0' if failed
int
match(const char *str, int len, int ovector[])
match(std::string_view const str, RegexMatches &matches) const
{
return pcre_exec(_rex, // the compiled pattern
_extra, // Extra data from study (maybe)
str, // the subject string
len, // the length of the subject
0, // start at offset 0 in the subject
0, // default options
ovector, // output vector for substring information
OVECCOUNT); // number of elements in the output vector
TSAssert(nullptr != _match_context);
bool const stat = _rex.exec(str, matches, 0, _match_context);
if (stat) {
return matches.size();
} else {
Dbg(dbg_ctl, "Regex match failure: %.*s", (int)str.length(), str.data());
}
return 0;
}

// Substitutions
int get_lengths(const int ovector[], int lengths[], TSRemapRequestInfo *rri, UrlComponents *req_url);
int substitute(char dest[], const char *src, const int ovector[], const int lengths[], TSHttpTxn txnp, TSRemapRequestInfo *rri,
int get_lengths(RegexMatches const &matches, int lengths[], TSRemapRequestInfo *rri, UrlComponents *req_url);
int substitute(char dest[], RegexMatches const &matches, const int lengths[], TSHttpTxn txnp, TSRemapRequestInfo *rri,
UrlComponents *req_url, bool lowercase_substitutions);

// setter / getters for members the linked list.
Expand All @@ -173,6 +164,12 @@ class RemapRegex
return _next;
}

inline void
set_match_context(RegexMatchContext *const ctx)
{
_match_context = ctx;
}

// setter / getters for order number within the linked list
inline void
set_order(int order)
Expand Down Expand Up @@ -263,10 +260,10 @@ class RemapRegex

bool _lowercase_substitutions = false;

pcre *_rex = nullptr;
pcre_extra *_extra = nullptr;
RemapRegex *_next = nullptr;
TSHttpStatus _status = static_cast<TSHttpStatus>(0);
Regex _rex;
RegexMatchContext *_match_context = nullptr; // owned by RemapInstance
RemapRegex *_next = nullptr;
TSHttpStatus _status = static_cast<TSHttpStatus>(0);

int _active_timeout = -1;
int _no_activity_timeout = -1;
Expand Down Expand Up @@ -319,7 +316,7 @@ RemapRegex::initialize(const std::string &reg, const std::string &sub, const std

// These take an option 0|1 value, without value it implies 1
if (opt.compare(start, 8, "caseless") == 0) {
_options |= PCRE_CASELESS;
_options |= RE_CASE_INSENSITIVE;
} else if (opt.compare(start, 23, "lowercase_substitutions") == 0) {
_lowercase_substitutions = true;
} else if (opt.compare(start, 8, "strategy") == 0) {
Expand Down Expand Up @@ -386,41 +383,20 @@ RemapRegex::initialize(const std::string &reg, const std::string &sub, const std

// Compile and study the regular expression.
int
RemapRegex::compile(const char *&error, int &erroffset)
RemapRegex::compile(std::string &error, int &erroffset)
{
char *str;
int ccount;

// Initialize these in case they are not set.
error = "unknown error";
erroffset = -1;

_rex = pcre_compile(_rex_string, // the pattern
_options, // options
&error, // for error message
&erroffset, // for error offset
nullptr); // use default character tables

if (nullptr == _rex) {
return -1;
}

_extra = pcre_study(_rex, PCRE_STUDY_EXTRA_NEEDED, &error);
if (error != nullptr) {
return -1;
}

// POOMA - also dependent on actual stack size. Crashes with previous value of 2047,
_extra->match_limit_recursion = 1750;
_extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;

if (pcre_fullinfo(_rex, _extra, PCRE_INFO_CAPTURECOUNT, &ccount) != 0) {
error = "call to pcre_fullinfo() failed";
bool const restat = _rex.compile(_rex_string, error, erroffset, _options);
if (!restat) {
TSError("[%s] Error compiling : %s", PLUGIN_NAME, _rex_string);
return -1;
}

// Get some info for the string substitutions
str = _subst;
char *str = _subst;
_num_subs = 0;

while (str && *str) {
Expand Down Expand Up @@ -464,10 +440,12 @@ RemapRegex::compile(const char *&error, int &erroffset)
}

if (ix > -1) {
if ((ix < 10) && (ix > ccount)) {
error = "using unavailable captured substring ($n) in substitution";
return -1;
}
/*
if ((ix < 10) && (ix > matches.size())) {
error = "using unavailable captured substring ($n) in substitution";
return -1;
}
*/

_sub_ix[_num_subs] = ix;
_sub_pos[_num_subs] = (str - _subst);
Expand All @@ -487,15 +465,15 @@ RemapRegex::compile(const char *&error, int &erroffset)
// We also calculate a total length for the new string, which is the max length the
// substituted string can have (use it to allocate a buffer before calling substitute() ).
int
RemapRegex::get_lengths(const int ovector[], int lengths[], TSRemapRequestInfo *rri, UrlComponents *req_url)
RemapRegex::get_lengths(RegexMatches const &matches, int lengths[], TSRemapRequestInfo *rri, UrlComponents *req_url)
{
int len = _subst_len + 1; // Bigger then necessary

for (int i = 0; i < _num_subs; i++) {
int ix = _sub_ix[i];

if (ix < 10) {
lengths[ix] = ovector[2 * ix + 1] - ovector[2 * ix]; // -1 - -1 == 0
lengths[ix] = matches[ix].length();
len += lengths[ix];
} else {
int tmp_len;
Expand Down Expand Up @@ -541,8 +519,8 @@ RemapRegex::get_lengths(const int ovector[], int lengths[], TSRemapRequestInfo *
// regex that was matches, while $1 - $9 are the corresponding groups. Return the final
// length of the string as written to dest (not including the trailing '0').
int
RemapRegex::substitute(char dest[], const char *src, const int ovector[], const int lengths[], TSHttpTxn txnp,
TSRemapRequestInfo *rri, UrlComponents *req_url, bool lowercase_substitutions)
RemapRegex::substitute(char dest[], RegexMatches const &matches, const int lengths[], TSHttpTxn txnp, TSRemapRequestInfo *rri,
UrlComponents *req_url, bool lowercase_substitutions)
{
if (_num_subs > 0) {
char *p1 = dest;
Expand All @@ -556,7 +534,7 @@ RemapRegex::substitute(char dest[], const char *src, const int ovector[], const
memcpy(p1, p2, _sub_pos[i] - prev);
p1 += (_sub_pos[i] - prev);
if (ix < 10) {
memcpy(p1, src + ovector[2 * ix], lengths[ix]);
memcpy(p1, matches[ix].data(), matches[ix].length());
p1 += lengths[ix];
} else {
char buff[INET6_ADDRSTRLEN];
Expand Down Expand Up @@ -630,17 +608,18 @@ RemapRegex::substitute(char dest[], const char *src, const int ovector[], const
struct RemapInstance {
RemapInstance() : filename("unknown") {}

RemapRegex *first = nullptr;
RemapRegex *last = nullptr;
bool pristine_url = false;
bool profile = false;
bool method = false;
bool query_string = true;
bool host = false;
int hits = 0;
int misses = 0;
int failures = 0;
std::string filename;
RemapRegex *first = nullptr;
RemapRegex *last = nullptr;
RegexMatchContext match_context = {};
bool pristine_url = false;
bool profile = false;
bool method = false;
bool query_string = true;
bool host = false;
int hits = 0;
int misses = 0;
int failures = 0;
std::string filename;
};

///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -783,11 +762,12 @@ TSRemapNewInstance(int argc, char *argv[], void **ih, char * /* errbuf ATS_UNUSE
continue;
}

const char *error;
std::string error;
int erroffset;
if (cur->compile(error, erroffset) < 0) {
Dbg(dbg_ctl, "Compiling regex: %s", regex.c_str());
if (0 != cur->compile(error, erroffset)) {
std::ostringstream oss;
oss << '[' << PLUGIN_NAME << "] PCRE failed in " << (ri->filename).c_str() << " (line " << lineno << ')';
oss << '[' << PLUGIN_NAME << "] Regex compile failed in " << (ri->filename).c_str() << " (line " << lineno << ')';
if (erroffset > 0) {
oss << " at offset " << erroffset;
}
Expand All @@ -801,6 +781,7 @@ TSRemapNewInstance(int argc, char *argv[], void **ih, char * /* errbuf ATS_UNUSE
} else {
Dbg(dbg_ctl, "Added regex=%s with subs=%s and options `%s'", regex.c_str(), subst.c_str(), options.c_str());
cur->set_order(++count);
cur->set_match_context(&ri->match_context);
auto tmp = cur.get();
if (ri->first == nullptr) {
ri->first = cur.release();
Expand All @@ -811,6 +792,8 @@ TSRemapNewInstance(int argc, char *argv[], void **ih, char * /* errbuf ATS_UNUSE
}
}

ri->match_context.setMatchLimit(REGEX_MATCH_LIMIT);

// Make sure we got something...
if (ri->first == nullptr) {
TSError("[%s] no regular expressions from the maps", PLUGIN_NAME);
Expand All @@ -823,6 +806,7 @@ TSRemapNewInstance(int argc, char *argv[], void **ih, char * /* errbuf ATS_UNUSE
void
TSRemapDeleteInstance(void *ih)
{
Dbg(dbg_ctl, "TSRemapDeleteInstance");
RemapInstance *ri = static_cast<RemapInstance *>(ih);
RemapRegex *re;
RemapRegex *tmp;
Expand Down Expand Up @@ -915,8 +899,7 @@ TSRemapDoRemap(void *ih, TSHttpTxn txnp, TSRemapRequestInfo *rri)
UrlComponents req_url;
req_url.populate(src_url.bufp, src_url.loc);

int ovector[OVECCOUNT];
int lengths[OVECCOUNT / 2 + 1];
int lengths[MATCHCOUNT + 1];
int dest_len;
TSRemapStatus retval = TSREMAP_DID_REMAP;
RemapRegex *re = ri->first;
Expand Down Expand Up @@ -963,12 +946,14 @@ TSRemapDoRemap(void *ih, TSHttpTxn txnp, TSRemapRequestInfo *rri)
match_buf[match_len] = '\0'; // NULL terminate the match string
Dbg(dbg_ctl, "Target match string is `%s'", match_buf);

RegexMatches matches(MATCHCOUNT);

// Apply the regular expressions, in order. First one wins.
while (re) {
// Since we check substitutions on parse time, we don't need to reset ovector
auto match_result = re->match(match_buf, match_len, ovector);
auto match_result = re->match(match_buf, matches);
if (match_result >= 0) {
int new_len = re->get_lengths(ovector, lengths, rri, &req_url);
int new_len = re->get_lengths(matches, lengths, rri, &req_url);

// Set timeouts
if (re->active_timeout_option() > (-1)) {
Expand Down Expand Up @@ -1040,7 +1025,7 @@ TSRemapDoRemap(void *ih, TSHttpTxn txnp, TSRemapRequestInfo *rri)
char *dest;

dest = static_cast<char *>(alloca(new_len + 8));
dest_len = re->substitute(dest, match_buf, ovector, lengths, txnp, rri, &req_url, lowercase_substitutions);
dest_len = re->substitute(dest, matches, lengths, txnp, rri, &req_url, lowercase_substitutions);

Dbg(dbg_ctl, "New URL is estimated to be %d bytes long, or less", new_len);
Dbg(dbg_ctl, "New URL is %s (length %d)", dest, dest_len);
Expand Down
Loading