diff --git a/addon/locale/fr/LC_MESSAGES/nvda.po b/addon/locale/fr/LC_MESSAGES/nvda.po index 8988250..b0044c3 100644 --- a/addon/locale/fr/LC_MESSAGES/nvda.po +++ b/addon/locale/fr/LC_MESSAGES/nvda.po @@ -6,126 +6,138 @@ msgid "" msgstr "" "Project-Id-Version: IBMTTS for NVDA\n" -"Report-Msgid-Bugs-To: 'nvda-translations@freelists.org'\n" -"POT-Creation-Date: 2023-01-10 16:31-0600\n" -"PO-Revision-Date: 2023-01-11 18:28+0100\n" -"Last-Translator: Michel Such \n" +"Report-Msgid-Bugs-To: 'nvda-translations@groups.io'\n" +"POT-Creation-Date: 2023-12-30 22:55-0600\n" +"PO-Revision-Date: 2024-01-03 16:03+0100\n" +"Last-Translator: Rémy Ruiz \n" "Language-Team: \n" "Language: fr\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=2; plural=(n > 1);\n" -"X-Generator: Poedit 3.2\n" +"X-Generator: Poedit 3.2.2\n" -#: addon\synthDrivers\_ibmeci.py:61 +#: addon\synthDrivers\_ibmeci.py:92 msgid "Castilian Spanish" msgstr "Espagnol Espagne" -#: addon\synthDrivers\_ibmeci.py:62 +#: addon\synthDrivers\_ibmeci.py:93 msgid "Latin American Spanish" msgstr "Espagnol Latino-Américain" -#: addon\synthDrivers\_ibmeci.py:63 +#: addon\synthDrivers\_ibmeci.py:94 msgid "Brazilian Portuguese" msgstr "Portugais Brésil" -#: addon\synthDrivers\_ibmeci.py:64 +#: addon\synthDrivers\_ibmeci.py:95 msgid "French" msgstr "Français" -#: addon\synthDrivers\_ibmeci.py:65 +#: addon\synthDrivers\_ibmeci.py:96 msgid "French Canadian" msgstr "Français Canada" -#: addon\synthDrivers\_ibmeci.py:66 +#: addon\synthDrivers\_ibmeci.py:97 msgid "Finnish" msgstr "Finnois" -#: addon\synthDrivers\_ibmeci.py:67 +#: addon\synthDrivers\_ibmeci.py:98 msgid "Chinese" msgstr "Chinois" -#: addon\synthDrivers\_ibmeci.py:68 +#: addon\synthDrivers\_ibmeci.py:99 msgid "Japanese" msgstr "Japonais" -#: addon\synthDrivers\_ibmeci.py:69 +#: addon\synthDrivers\_ibmeci.py:100 msgid "Korean" msgstr "Coréen" -#: addon\synthDrivers\_ibmeci.py:70 +#: addon\synthDrivers\_ibmeci.py:101 msgid "German" msgstr "Allemand" -#: addon\synthDrivers\_ibmeci.py:71 +#: addon\synthDrivers\_ibmeci.py:102 msgid "Italian" msgstr "Italien" -#: addon\synthDrivers\_ibmeci.py:72 +#: addon\synthDrivers\_ibmeci.py:103 msgid "American English" msgstr "Anglais Américain" -#: addon\synthDrivers\_ibmeci.py:73 +#: addon\synthDrivers\_ibmeci.py:104 msgid "British English" msgstr "Anglais Britannique" -#: addon\synthDrivers\_ibmeci.py:74 +#: addon\synthDrivers\_ibmeci.py:105 msgid "Swedish" msgstr "Suédois" -#: addon\synthDrivers\_ibmeci.py:75 +#: addon\synthDrivers\_ibmeci.py:106 msgid "Norwegian" msgstr "Norvégien" -#: addon\synthDrivers\_ibmeci.py:76 +#: addon\synthDrivers\_ibmeci.py:107 msgid "Danish" msgstr "Danois" -#: addon\synthDrivers\_ibmeci.py:77 +#: addon\synthDrivers\_ibmeci.py:108 msgid "Hong Kong Cantonese" msgstr "Hong Kong Cantonais" -#: addon\synthDrivers\ibmeci.py:169 +#: addon\synthDrivers\ibmeci.py:204 msgid "Rate boos&t" msgstr "Voix t&urbo" -#: addon\synthDrivers\ibmeci.py:171 +#: addon\synthDrivers\ibmeci.py:206 msgid "Hea&d size" msgstr "Taille de la &tête" -#: addon\synthDrivers\ibmeci.py:172 +#: addon\synthDrivers\ibmeci.py:207 msgid "Rou&ghness" msgstr "E&nrouement" -#: addon\synthDrivers\ibmeci.py:173 +#: addon\synthDrivers\ibmeci.py:208 msgid "Breathi&ness" msgstr "&Respiration" -#: addon\synthDrivers\ibmeci.py:174 +#: addon\synthDrivers\ibmeci.py:209 msgid "Enable backquote voice &tags" msgstr "Activer les &balises de changement de voix" -#: addon\synthDrivers\ibmeci.py:175 +#: 
addon\synthDrivers\ibmeci.py:210 msgid "Enable &abbreviation expansion" msgstr "Activer l'e&xpansion des abréviations" -#: addon\synthDrivers\ibmeci.py:176 +#: addon\synthDrivers\ibmeci.py:211 msgid "Enable phras&e prediction" msgstr "Activer la prédiction de phras&e" -#: addon\synthDrivers\ibmeci.py:177 -msgid "&Shorten pauses" -msgstr "&Réduire les pauses" +#: addon\synthDrivers\ibmeci.py:212 +msgid "&Pauses" +msgstr "&Pauses" -#: addon\synthDrivers\ibmeci.py:178 +#: addon\synthDrivers\ibmeci.py:213 msgid "Al&ways Send Current Speech Settings" msgstr "Tou&jours envoyer les paramètres vocaux" -#: addon\synthDrivers\ibmeci.py:179 +#: addon\synthDrivers\ibmeci.py:214 msgid "Sa&mple Rate" msgstr "Tau&x d'échantillonnage" +#: addon\synthDrivers\ibmeci.py:550 +msgid "Do not shorten" +msgstr "Ne pas réduire" + +#: addon\synthDrivers\ibmeci.py:551 +msgid "Shorten at end of text only" +msgstr "Réduire à la fin du texte uniquement" + +#: addon\synthDrivers\ibmeci.py:552 +msgid "Shorten all pauses" +msgstr "Réduire toutes les pauses" + #. Translators: a message dialog asking to retry or cancel when downloading a file. #: addon\globalPlugins\_ibmttsUtils.py:77 msgid "" @@ -169,8 +181,8 @@ msgstr "" #. Translators: The title displayed when copying IBMTTS files to Add-on was successful. #: addon\globalPlugins\_ibmttsUtils.py:138 #: addon\globalPlugins\_ibmttsUtils.py:165 -#: addon\globalPlugins\_ibmttsUtils.py:290 addon\globalPlugins\ibmtts.py:108 -#: addon\globalPlugins\ibmtts.py:136 +#: addon\globalPlugins\_ibmttsUtils.py:297 addon\globalPlugins\ibmtts.py:126 +#: addon\globalPlugins\ibmtts.py:154 msgid "Error" msgstr "Erreur" @@ -192,33 +204,33 @@ msgid "Failed to install add-on from %s" msgstr "Échec de l'installation de l'extension %s" #. Translators: The message displayed if the folder to store the downloaded file can't be created. -#: addon\globalPlugins\_ibmttsUtils.py:288 +#: addon\globalPlugins\_ibmttsUtils.py:295 msgid "Unable to create the folder to save the update file." msgstr "" "Impossible de créer le dossier pour enregistrer le fichier de mise à jour." -#: addon\globalPlugins\_ibmttsUtils.py:298 +#: addon\globalPlugins\_ibmttsUtils.py:305 #, python-format msgid "Downloading the new %s update" msgstr "Téléchargement de la mise à jour de %s" -#: addon\globalPlugins\_ibmttsUtils.py:299 +#: addon\globalPlugins\_ibmttsUtils.py:306 msgid "downloading" msgstr "Téléchargement en cours" #. Translators: The message displayed when no updates were found. -#: addon\globalPlugins\_ibmttsUtils.py:333 +#: addon\globalPlugins\_ibmttsUtils.py:340 #, python-format msgid "There are no updates available for the %s addon." msgstr "Aucune mise à jour disponible pour l'extension %s" #. Translators: The title displayed when no updates were found. -#: addon\globalPlugins\_ibmttsUtils.py:335 +#: addon\globalPlugins\_ibmttsUtils.py:342 msgid "No updates available" msgstr "Aucune mise à jour disponible" #. Translators: A message asking the user if they wish to update the add-on -#: addon\globalPlugins\_ibmttsUtils.py:342 +#: addon\globalPlugins\_ibmttsUtils.py:349 #, python-format msgid "" "A new version of %s was found. The new version is %s. Would you like to " @@ -228,59 +240,140 @@ msgstr "" "Souhaitez-vous mettre à jour cette extension maintenant ?" #. Translators: Title for message asking if the user wishes to update the add-on. -#: addon\globalPlugins\_ibmttsUtils.py:345 +#: addon\globalPlugins\_ibmttsUtils.py:352 msgid "Update add-on" msgstr "Mettre à jour l'extension" +#. 
Translators: The message displayed when installing an add-on package is prohibited, +#. because it requires a later version of NVDA than is currently installed. +#: addon\globalPlugins\_ibmttsUtils.py:391 +#, python-brace-format +msgid "" +"Installation of {summary} {version} has been blocked. The minimum NVDA " +"version required for this add-on is {minimumNVDAVersion}, your current NVDA " +"version is {NVDAVersion}" +msgstr "" +"L'installation de {summary} {version} a été bloquée. La version minimale de " +"NVDA requise pour cette extension est {minimumNVDAVersion}, votre version " +"actuelle de NVDA est {NVDAVersion}" + +#. Translators: The title of a dialog presented when an error occurs. +#. Translators: The title of the dialog presented when the add-on is too old. +#: addon\globalPlugins\_ibmttsUtils.py:403 +#: addon\globalPlugins\_ibmttsUtils.py:422 +msgid "Add-on not compatible" +msgstr "Extension non compatible" + +#. Translators: A message informing the user that this addon can not be installed +#. because it is not compatible. +#: addon\globalPlugins\_ibmttsUtils.py:412 +#, python-brace-format +msgid "" +"Installation of {summary} {version} has been blocked. An updated version of " +"this add-on is required, the minimum add-on API supported by this version of " +"NVDA is {backCompatToAPIVersion}" +msgstr "" +"L'installation de {summary} {version} a été bloquée. Une version mise à jour " +"de cette extension est requise, l'API minimum de cette extension prise en " +"charge par cette version de NVDA est {backCompatToAPIVersion}" + +#: addon\globalPlugins\_ibmttsUtils.py:436 +msgid "&Not now" +msgstr "&Pas maintenant" + +#: addon\globalPlugins\_ibmttsUtils.py:447 +#, python-format +msgid "Request for contributions to %s" +msgstr "Demande de contributions pour %s" + +#: addon\globalPlugins\_ibmttsUtils.py:448 +msgid "" +"Creating add-ons demands substantial time and effort. With limited job " +"prospects in my country, your donations could significantly aid in " +"dedicating more time to developing free plugins for the community.\n" +"Your contribution would support the development of this and other free " +"projects.\n" +"Would you like to contribute to this cause? Select from our available " +"payment methods below. You will be redirected to the corresponding website " +"to complete your donation.\n" +"Thank you for your support and generosity." +msgstr "" +"La création d'extensions demande beaucoup de temps et d’efforts. Les " +"perspectives d'emploi étant limitées dans mon pays, vos dons pourraient " +"contribuer de manière significative à consacrer plus de temps au " +"développement d'extensions gratuites pour la communauté.\n" +"Votre contribution soutiendrait le développement de ce projet et d'autres " +"projets gratuits.\n" +"Vous souhaitez contribuer à cette cause ? Choisissez parmi nos méthodes de " +"paiement disponibles ci-dessous. Vous serez redirigé vers le site " +"correspondant pour finaliser votre don.\n" +"Merci de votre soutien et de votre générosité." + +#: addon\globalPlugins\ibmtts.py:25 addon\installTasks.py:21 +msgid "Using Paypal" +msgstr "Utiliser Paypal" + +#: addon\globalPlugins\ibmtts.py:29 addon\installTasks.py:25 +msgid "using Co-fi" +msgstr "Utiliser Co-fi" + +#: addon\globalPlugins\ibmtts.py:33 addon\installTasks.py:29 +msgid "See more methods on my github Page" +msgstr "Voir plus de méthodes sur ma page GitHub" + #. Translators: This is the label for the IBMTTS settings category in NVDA Settings screen. 
-#: addon\globalPlugins\ibmtts.py:26 +#: addon\globalPlugins\ibmtts.py:41 msgid "IBMTTS" msgstr "IBMTTS" -#: addon\globalPlugins\ibmtts.py:33 +#: addon\globalPlugins\ibmtts.py:48 msgid "&Automatically check for updates for IBMTTS" msgstr "Rechercher &automatiquement les mises à jour pour IBMTTS" #. Translators: This is the button to check for new updates of the add-on. -#: addon\globalPlugins\ibmtts.py:36 +#: addon\globalPlugins\ibmtts.py:51 msgid "&Check for update" msgstr "&Rechercher une mise à jour" #. Translators: This is the label for the IBMTTS folder address. -#: addon\globalPlugins\ibmtts.py:39 +#: addon\globalPlugins\ibmtts.py:54 msgid "IBMTTS folder address" msgstr "Adresse du dossier IBMTTS" #. Translators: This is the label for the IBMTTS library name. -#: addon\globalPlugins\ibmtts.py:41 +#: addon\globalPlugins\ibmtts.py:56 msgid "IBMTTS library name (dll)" msgstr "Nom de la bibliothèque IBMTTS (dll)" #. Translators: This is the button to explore and find for an IBMTTS library and files. -#: addon\globalPlugins\ibmtts.py:43 +#: addon\globalPlugins\ibmtts.py:58 msgid "&Browse for IBMTTS library..." msgstr "&Rechercher une bibliothèque IBMTTS..." #. Translators: This is the button to copy external IBMTTS files into synth driver Add-on. -#: addon\globalPlugins\ibmtts.py:46 +#: addon\globalPlugins\ibmtts.py:61 msgid "" "&Copy IBMTTS files in an add-on (may not work for some IBMTTS distributions)" msgstr "" "&Copier les fichiers IBMTTS dans une extension. (Cela ne fonctionne peut-" "être pas pour certaines distributions IBMTTS)" -#: addon\globalPlugins\ibmtts.py:62 +#: addon\globalPlugins\ibmtts.py:63 +msgid "&Support IBMTTS add-on" +msgstr "&Supporter l'extension IBMTTS" + +#: addon\globalPlugins\ibmtts.py:79 msgid "Select the IBMTTS library (dll)" msgstr "Choisir la bibliothèque IBMTTS (dll)" #. Translators: the label for the dynamic link library extension (dll) file type -#: addon\globalPlugins\ibmtts.py:64 +#: addon\globalPlugins\ibmtts.py:81 #, python-brace-format msgid "dynamic link library (*.{ext})" msgstr "Bibliothèque de liens dynamiques (*.{ext})" #. Translators: The message displayed when the IBMTTS files folder and library name have been set. -#: addon\globalPlugins\ibmtts.py:74 +#: addon\globalPlugins\ibmtts.py:91 msgid "" "The IBMTTS files location has been set. If you want to use it with a " "portable version of NVDA, please use the \"Copy IBMTTS files into driver add-" @@ -292,12 +385,12 @@ msgstr "" #. Translators: The title displayed when the IBMTTS files folder and library name have been set. #. Translators: The title displayed when copying IBMTTS files to Add-on was successful. -#: addon\globalPlugins\ibmtts.py:76 addon\globalPlugins\ibmtts.py:130 +#: addon\globalPlugins\ibmtts.py:93 addon\globalPlugins\ibmtts.py:148 msgid "Success" msgstr "Succès" #. Translators: The message displayed in the dialog that inform you the specified library is invalid. -#: addon\globalPlugins\ibmtts.py:82 +#: addon\globalPlugins\ibmtts.py:100 msgid "" "The specified dll file seems to be an incorrect IBMTTS library. Would you " "like to select another library?" @@ -305,11 +398,11 @@ msgstr "" "Le fichier dll spécifié semble être une bibliothèque IBMTTS incorrecte. " "Voulez-vous choisir une autre bibliothèque ?" 
-#: addon\globalPlugins\ibmtts.py:83 +#: addon\globalPlugins\ibmtts.py:101 msgid "Error loading library" msgstr "Erreur lors du chargement de la bibliothèque" -#: addon\globalPlugins\ibmtts.py:91 +#: addon\globalPlugins\ibmtts.py:109 msgid "" "Are you sure to copy IBMTTS files to local NVDA installation and register a " "new add-on called \"eciLibraries\" to store the libraries? It may not work " @@ -327,11 +420,11 @@ msgstr "" "fichiers IBMTTS de NVDA. Celle-ci et \"eciLibraries\"" #. Translators: The title of the Asking dialog displayed when trying to copy IBMTTS files. -#: addon\globalPlugins\ibmtts.py:94 +#: addon\globalPlugins\ibmtts.py:112 msgid "Copy IBMTTS files" msgstr "Copie des fichiers IBMTTS en cours" -#: addon\globalPlugins\ibmtts.py:106 +#: addon\globalPlugins\ibmtts.py:124 msgid "" "Unable to copy the files because the source and destination paths are the " "same." @@ -340,19 +433,19 @@ msgstr "" "destination sont identiques." #. Translators: The title of the dialog presented while IBMTTS files are being copied. -#: addon\globalPlugins\ibmtts.py:117 +#: addon\globalPlugins\ibmtts.py:135 msgid "Copying files" msgstr "Copie des fichiers en cours" #. Translators: The message displayed while IBMTTS files are being copied. -#: addon\globalPlugins\ibmtts.py:119 +#: addon\globalPlugins\ibmtts.py:137 msgid "Please wait while IBMTTS files are copied into add-on." msgstr "" "Veuillez patienter pendant que les fichiers IBMTTS sont copiés dans " "l'extension." #. Translators: The message displayed when copying IBMTTS files to Add-on was successful. -#: addon\globalPlugins\ibmtts.py:128 +#: addon\globalPlugins\ibmtts.py:146 msgid "" "Successfully copied IBMTTS files. The local copy will be used after restart " "NVDA." @@ -361,12 +454,12 @@ msgstr "" "redémarrage de NVDA." #. Translators: The message displayed when errors were found while trying to copy IBMTTS files to Add-on. -#: addon\globalPlugins\ibmtts.py:136 +#: addon\globalPlugins\ibmtts.py:154 msgid "Error copying IBMTTS files" msgstr "Erreur lors de la copie des fichiers IBMTTS" #. Translators: the message shown if the driver can't find libraries during installation. -#: addon\installTasks.py:13 +#: addon\installTasks.py:71 msgid "" "The synthesizer won't be available until you set IBMTTS files. NVDA won't " "show this synthesizer in teh synthesizers lists because you need to set the " @@ -378,11 +471,11 @@ msgstr "" "IBMTTS. NVDA ne montrera pas ce synthétiseur dans la liste de synthétiseur " "car vous devez d'abord définir l'emplacement des fichiers du synthétiseur.\n" "\tPour ce faire ouvrir le dialogue Paramètres de NVDA, sélectionner la " -"catégorie IBMTTS et utiliser le bouton \"Rechercher une bibliothèque IBMTTS" -"\" pour sélectionner le dossier des fichiers IBMTTS.\n" +"catégorie IBMTTS et utiliser le bouton \"Rechercher une bibliothèque " +"IBMTTS\" pour sélectionner le dossier des fichiers IBMTTS.\n" #. Translators: title of message box when user is installing NVDA -#: addon\installTasks.py:24 +#: addon\installTasks.py:76 msgid "IBMTTS driver for NVDA" msgstr "Pilote IBMTTS pour NVDA" diff --git a/addon/synthDrivers/ibmeci.py b/addon/synthDrivers/ibmeci.py index 7c20d68..775ac69 100644 --- a/addon/synthDrivers/ibmeci.py +++ b/addon/synthDrivers/ibmeci.py @@ -3,9 +3,12 @@ # Author: David CM and others. 
 #synthDrivers/ibmeci.py
-import six, synthDriverHandler, languageHandler, os, re
+import os, sys, six, synthDriverHandler, languageHandler
+lib = os.path.join(os.path.dirname(__file__), "regex")
+sys.path.append(lib)
+import regex as re
+sys.path.remove(lib)
 from synthDriverHandler import synthDoneSpeaking, SynthDriver, synthIndexReached, VoiceInfo
-
 from collections import OrderedDict
 from six import string_types
 from logHandler import log
@@ -48,10 +51,10 @@
 #Crash words, formerly part of anticrash_res.
 re.compile(br'\b(.*?)c(ae|\xe6)sur(e)?', re.I): br'\1seizur',
 re.compile(br"\b(|\d+|\W+)h'(r|v)[e]", re.I): br"\1h \2e",
-re.compile(br"\b(\w+[bdfhjlmnqrvyz])(h[he]s)([abcdefghjklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
+re.compile(br"\b(\w+[bdfhjlmnqrvz])(h[he]s)([abcdefghjklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
 re.compile(br"\b(\w+[bdfhjlmnqrvz])(h[he]s)(iron+[degins]?)", re.I): br"\1 \2\3",
-re.compile(br"\b(\w+'{1,}[bcdfghjklmnpqrstvwxyz])'*(h+[he]s)([abcdefghijklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
-re.compile(br"\b(\w+[bcdfghjklmnpqrstvwxyz])('{1,}h+[he]s)([abcdefghijklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
+re.compile(br"\b(\w+'{1,}[bcdfghjklmnpqrstvwxz])'*(h+[he]s)([abcdefghijklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
+re.compile(br"\b(\w+[bcdfghjklmnpqrstvwxz])('{1,}h+[he]s)([abcdefghijklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
 re.compile(br"(\d):(\d\d[snrt][tdh])", re.I): br"\1 \2",
 re.compile(br"\b([bcdfghjklmnpqrstvwxz]+)'([bcdefghjklmnpqrstvwxz']+)'([drtv][aeiou]?)", re.I): br"\1 \2 \3",
 re.compile(br"\b(you+)'(re)+'([drv]e?)", re.I): br"\1 \2 \3",
@@ -77,10 +80,10 @@
 re.compile(br"\b(Mc)\s+([A-Z][a-z]|[A-Z][A-Z]+)"): br"\1\2",
 re.compile(br'\b(.*?)c(ae|\xe6)sur(e)?', re.I): br'\1seizur',
 re.compile(br"\b(|\d+|\W+)h'(r|v)[e]", re.I): br"\1h \2e",
-re.compile(br"\b(\w+[bdfhjlmnqrvyz])(h[he]s)([abcdefghjklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
-re.compile(br"\b(\w+[bdfhjlmnqrvyz])(h[he]s)(iron+[degins]?)", re.I): br"\1 \2\3",
-re.compile(br"\b(\w+'{1,}[bcdfghjklmnpqrstvwxyz])'*(h+[he]s)([abcdefghijklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
-re.compile(br"\b(\w+[bcdfghjklmnpqrstvwxyz])('{1,}h+[he]s)([abcdefghijklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
+re.compile(br"\b(\w+[bdfhjlmnqrvz])(h[he]s)([abcdefghjklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
+re.compile(br"\b(\w+[bdfhjlmnqrvz])(h[he]s)(iron+[degins]?)", re.I): br"\1 \2\3",
+re.compile(br"\b(\w+'{1,}[bcdfghjklmnpqrstvwxz])'*(h+[he]s)([abcdefghijklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
+re.compile(br"\b(\w+[bcdfghjklmnpqrstvwxz])('{1,}h+[he]s)([abcdefghijklmnopqrstvwy]\w+)\b", re.I): br"\1 \2\3",
 re.compile(br"(\d):(\d\d[snrt][tdh])", re.I): br"\1 \2",
 re.compile(br"\b([bcdfghjklmnpqrstvwxz]+)'([bcdefghjklmnpqrstvwxz']+)'([drtv][aeiou]?)", re.I): br"\1 \2 \3",
 re.compile(br"\b(you+)'(re)+'([drv]e?)", re.I): br"\1 \2 \3",
@@ -131,6 +134,8 @@
 re.compile(br'(\d{1,2}):(00):(\d{1,2})'): br'\1:\2 \3',
 }
 french_fixes = {
+# Fix for numbers starting with one or several 0s.
+re.compile(br"(?:(?<=\b[1-9]\d{0,2}[\.\s](?:\d{3}[\.\s])*)(?!\d{3}\b)|(? […]

[…]

+Requires-Python: >=3.8
+Description-Content-Type: text/x-rst
+License-File: LICENSE.txt
+
+Introduction
+------------
+
+This regex implementation is backwards-compatible with the standard 're' module, but offers additional functionality.
+
+Python 2
+--------
+
+Python 2 is no longer supported. The last release that supported Python 2 was 2021.11.10.
+
+PyPy
+----
+
+This module is targeted at CPython.
It expects that all codepoints are the same width, so it won't behave properly with PyPy outside U+0000..U+007F because PyPy stores strings as UTF-8. + +Multithreading +-------------- + +The regex module releases the GIL during matching on instances of the built-in (immutable) string classes, enabling other Python threads to run concurrently. It is also possible to force the regex module to release the GIL during matching by calling the matching methods with the keyword argument ``concurrent=True``. The behaviour is undefined if the string changes during matching, so use it *only* when it is guaranteed that that won't happen. + +Unicode +------- + +This module supports Unicode 16.0.0. Full Unicode case-folding is supported. + +Flags +----- + +There are 2 kinds of flag: scoped and global. Scoped flags can apply to only part of a pattern and can be turned on or off; global flags apply to the entire pattern and can only be turned on. + +The scoped flags are: ``ASCII (?a)``, ``FULLCASE (?f)``, ``IGNORECASE (?i)``, ``LOCALE (?L)``, ``MULTILINE (?m)``, ``DOTALL (?s)``, ``UNICODE (?u)``, ``VERBOSE (?x)``, ``WORD (?w)``. + +The global flags are: ``BESTMATCH (?b)``, ``ENHANCEMATCH (?e)``, ``POSIX (?p)``, ``REVERSE (?r)``, ``VERSION0 (?V0)``, ``VERSION1 (?V1)``. + +If neither the ``ASCII``, ``LOCALE`` nor ``UNICODE`` flag is specified, it will default to ``UNICODE`` if the regex pattern is a Unicode string and ``ASCII`` if it's a bytestring. + +The ``ENHANCEMATCH`` flag makes fuzzy matching attempt to improve the fit of the next match that it finds. + +The ``BESTMATCH`` flag makes fuzzy matching search for the best match instead of the next match. + +Old vs new behaviour +-------------------- + +In order to be compatible with the re module, this module has 2 behaviours: + +* **Version 0** behaviour (old behaviour, compatible with the re module): + + Please note that the re module's behaviour may change over time, and I'll endeavour to match that behaviour in version 0. + + * Indicated by the ``VERSION0`` flag. + + * Zero-width matches are not handled correctly in the re module before Python 3.7. The behaviour in those earlier versions is: + + * ``.split`` won't split a string at a zero-width match. + + * ``.sub`` will advance by one character after a zero-width match. + + * Inline flags apply to the entire pattern, and they can't be turned off. + + * Only simple sets are supported. + + * Case-insensitive matches in Unicode use simple case-folding by default. + +* **Version 1** behaviour (new behaviour, possibly different from the re module): + + * Indicated by the ``VERSION1`` flag. + + * Zero-width matches are handled correctly. + + * Inline flags apply to the end of the group or pattern, and they can be turned off. + + * Nested sets and set operations are supported. + + * Case-insensitive matches in Unicode use full case-folding by default. + +If no version is specified, the regex module will default to ``regex.DEFAULT_VERSION``. + +Case-insensitive matches in Unicode +----------------------------------- + +The regex module supports both simple and full case-folding for case-insensitive matches in Unicode. Use of full case-folding can be turned on using the ``FULLCASE`` flag. Please note that this flag affects how the ``IGNORECASE`` flag works; the ``FULLCASE`` flag itself does not turn on case-insensitive matching. + +Version 0 behaviour: the flag is off by default. + +Version 1 behaviour: the flag is on by default. 
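+A quick sketch of the difference (consistent with the full case-folding examples later in this file; "ß" full-case-folds to "ss"):
+
+.. sourcecode:: python
+
+    >>> # Simple case-folding (version 0) doesn't fold "ß" to "ss":
+    >>> bool(regex.match(r'(?iV0)strasse', 'stra\N{LATIN SMALL LETTER SHARP S}e'))
+    False
+    >>> # Full case-folding (version 1) does:
+    >>> bool(regex.match(r'(?iV1)strasse', 'stra\N{LATIN SMALL LETTER SHARP S}e'))
+    True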
+
+Nested sets and set operations
+------------------------------
+
+It's not possible to support both simple sets, as used in the re module, and nested sets at the same time because of a difference in the meaning of an unescaped ``"["`` in a set.
+
+For example, the pattern ``[[a-z]--[aeiou]]`` is treated in the version 0 behaviour (simple sets, compatible with the re module) as:
+
+* Set containing "[" and the letters "a" to "z"
+
+* Literal "--"
+
+* Set containing letters "a", "e", "i", "o", "u"
+
+* Literal "]"
+
+but in the version 1 behaviour (nested sets, enhanced behaviour) as:
+
+* Set which is:
+
+  * Set containing the letters "a" to "z"
+
+* but excluding:
+
+  * Set containing the letters "a", "e", "i", "o", "u"
+
+Version 0 behaviour: only simple sets are supported.
+
+Version 1 behaviour: nested sets and set operations are supported.
+
+Notes on named groups
+---------------------
+
+All groups have a group number, starting from 1.
+
+Groups with the same group name will have the same group number, and groups with a different group name will have a different group number.
+
+The same name can be used by more than one group, with later captures 'overwriting' earlier captures. All the captures of the group will be available from the ``captures`` method of the match object.
+
+Group numbers will be reused across different branches of a branch reset, eg. ``(?|(first)|(second))`` has only group 1. If groups have different group names then they will, of course, have different group numbers, eg. ``(?|(?P<foo>first)|(?P<bar>second))`` has group 1 ("foo") and group 2 ("bar").
+
+In the regex ``(\s+)(?|(?P<foo>[A-Z]+)|(\w+) (?P<foo>[0-9]+))`` there are 2 groups:
+
+* ``(\s+)`` is group 1.
+
+* ``(?P<foo>[A-Z]+)`` is group 2, also called "foo".
+
+* ``(\w+)`` is group 2 because of the branch reset.
+
+* ``(?P<foo>[0-9]+)`` is group 2 because it's called "foo".
+
+If you want to prevent ``(\w+)`` from being group 2, you need to name it (different name, different group number).
+
+Additional features
+-------------------
+
+The issue numbers relate to the Python bug tracker, except where listed otherwise.
+
+Added ``\p{Horiz_Space}`` and ``\p{Vert_Space}`` (`GitHub issue 477 <https://github.com/mrabarnett/mrab-regex/issues/477>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``\p{Horiz_Space}`` or ``\p{H}`` matches horizontal whitespace and ``\p{Vert_Space}`` or ``\p{V}`` matches vertical whitespace.
+
+Added support for lookaround in conditional pattern (`Hg issue 163 <https://github.com/mrabarnett/mrab-regex/issues/163>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The test of a conditional pattern can be a lookaround.
+
+.. sourcecode:: python
+
+    >>> regex.match(r'(?(?=\d)\d+|\w+)', '123abc')
+    <regex.Match object; span=(0, 3), match='123'>
+    >>> regex.match(r'(?(?=\d)\d+|\w+)', 'abc123')
+    <regex.Match object; span=(0, 6), match='abc123'>
+
+This is not quite the same as putting a lookaround in the first branch of a pair of alternatives.
+
+.. sourcecode:: python
+
+    >>> print(regex.match(r'(?:(?=\d)\d+\b|\w+)', '123abc'))
+    <regex.Match object; span=(0, 6), match='123abc'>
+    >>> print(regex.match(r'(?(?=\d)\d+\b|\w+)', '123abc'))
+    None
+
+In the first example, the lookaround matched, but the remainder of the first branch failed to match, and so the second branch was attempted, whereas in the second example, the lookaround matched, and the first branch failed to match, but the second branch was **not** attempted.
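+For comparison, a minimal sketch of the plain group conditional ``(?(group)yes-pattern|no-pattern)``, which tests whether a group has captured rather than using a lookaround:
+
+.. sourcecode:: python
+
+    >>> # Digits, optionally wrapped in parentheses; group 1 decides whether ")" is required.
+    >>> bool(regex.fullmatch(r'(\()?\d+(?(1)\))', '(42)'))
+    True
+    >>> bool(regex.fullmatch(r'(\()?\d+(?(1)\))', '42'))
+    True
+    >>> bool(regex.fullmatch(r'(\()?\d+(?(1)\))', '(42'))
+    False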
+
+Added POSIX matching (leftmost longest) (`Hg issue 150 <https://github.com/mrabarnett/mrab-regex/issues/150>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The POSIX standard for regex is to return the leftmost longest match. This can be turned on using the ``POSIX`` flag.
+
+.. sourcecode:: python
+
+    >>> # Normal matching.
+    >>> regex.search(r'Mr|Mrs', 'Mrs')
+    <regex.Match object; span=(0, 2), match='Mr'>
+    >>> regex.search(r'one(self)?(selfsufficient)?', 'oneselfsufficient')
+    <regex.Match object; span=(0, 7), match='oneself'>
+    >>> # POSIX matching.
+    >>> regex.search(r'(?p)Mr|Mrs', 'Mrs')
+    <regex.Match object; span=(0, 3), match='Mrs'>
+    >>> regex.search(r'(?p)one(self)?(selfsufficient)?', 'oneselfsufficient')
+    <regex.Match object; span=(0, 17), match='oneselfsufficient'>
+
+Note that it will take longer to find matches because when it finds a match at a certain position, it won't return that immediately, but will keep looking to see if there's another longer match there.
+
+Added ``(?(DEFINE)...)`` (`Hg issue 152 <https://github.com/mrabarnett/mrab-regex/issues/152>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If there's no group called "DEFINE", then ... will be ignored except that any groups defined within it can be called and that the normal rules for numbering groups still apply.
+
+.. sourcecode:: python
+
+    >>> regex.search(r'(?(DEFINE)(?P<quant>\d+)(?P<item>\w+))(?&quant) (?&item)', '5 elephants')
+    <regex.Match object; span=(0, 11), match='5 elephants'>
+
+Added ``(*PRUNE)``, ``(*SKIP)`` and ``(*FAIL)`` (`Hg issue 153 <https://github.com/mrabarnett/mrab-regex/issues/153>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``(*PRUNE)`` discards the backtracking info up to that point. When used in an atomic group or a lookaround, it won't affect the enclosing pattern.
+
+``(*SKIP)`` is similar to ``(*PRUNE)``, except that it also sets where in the text the next attempt to match will start. When used in an atomic group or a lookaround, it won't affect the enclosing pattern.
+
+``(*FAIL)`` causes immediate backtracking. ``(*F)`` is a permitted abbreviation.
+
+Added ``\K`` (`Hg issue 151 <https://github.com/mrabarnett/mrab-regex/issues/151>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Keeps the part of the entire match after the position where ``\K`` occurred; the part before it is discarded.
+
+It does not affect what groups return.
+
+.. sourcecode:: python
+
+    >>> m = regex.search(r'(\w\w\K\w\w\w)', 'abcdef')
+    >>> m[0]
+    'cde'
+    >>> m[1]
+    'abcde'
+    >>>
+    >>> m = regex.search(r'(?r)(\w\w\K\w\w\w)', 'abcdef')
+    >>> m[0]
+    'bc'
+    >>> m[1]
+    'bcdef'
+
+Added capture subscripting for ``expandf`` and ``subf``/``subfn`` (`Hg issue 133 <https://github.com/mrabarnett/mrab-regex/issues/133>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can use subscripting to get the captures of a repeated group.
+
+.. sourcecode:: python
+
+    >>> m = regex.match(r"(\w)+", "abc")
+    >>> m.expandf("{1}")
+    'c'
+    >>> m.expandf("{1[0]} {1[1]} {1[2]}")
+    'a b c'
+    >>> m.expandf("{1[-1]} {1[-2]} {1[-3]}")
+    'c b a'
+    >>>
+    >>> m = regex.match(r"(?P<letter>\w)+", "abc")
+    >>> m.expandf("{letter}")
+    'c'
+    >>> m.expandf("{letter[0]} {letter[1]} {letter[2]}")
+    'a b c'
+    >>> m.expandf("{letter[-1]} {letter[-2]} {letter[-3]}")
+    'c b a'
+
+Added support for referring to a group by number using ``(?P=...)``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This is in addition to the existing ``\g<...>``.
+
+Fixed the handling of locale-sensitive regexes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``LOCALE`` flag is intended for legacy code and has limited support. You're still recommended to use Unicode instead.
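+A minimal sketch of why Unicode is usually the better choice, comparing ASCII and Unicode word characters (an illustrative example; behaviour assumed from the ``ASCII``/``UNICODE`` flag semantics above):
+
+.. sourcecode:: python
+
+    >>> # ASCII \w stops at accented letters; Unicode \w doesn't.
+    >>> regex.findall(r'(?a)\w+', 'naïve café')
+    ['na', 've', 'caf']
+    >>> regex.findall(r'(?u)\w+', 'naïve café')
+    ['naïve', 'café']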
+
+Added partial matches (`Hg issue 102 <https://github.com/mrabarnett/mrab-regex/issues/102>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A partial match is one that matches up to the end of string, but that string has been truncated and you want to know whether a complete match could be possible if the string had not been truncated.
+
+Partial matches are supported by ``match``, ``search``, ``fullmatch`` and ``finditer`` with the ``partial`` keyword argument.
+
+Match objects have a ``partial`` attribute, which is ``True`` if it's a partial match.
+
+For example, if you wanted a user to enter a 4-digit number and check it character by character as it was being entered:
+
+.. sourcecode:: python
+
+    >>> pattern = regex.compile(r'\d{4}')
+
+    >>> # Initially, nothing has been entered:
+    >>> print(pattern.fullmatch('', partial=True))
+    <regex.Match object; span=(0, 0), match='', partial=True>
+
+    >>> # An empty string is OK, but it's only a partial match.
+    >>> # The user enters a letter:
+    >>> print(pattern.fullmatch('a', partial=True))
+    None
+    >>> # It'll never match.
+
+    >>> # The user deletes that and enters a digit:
+    >>> print(pattern.fullmatch('1', partial=True))
+    <regex.Match object; span=(0, 1), match='1', partial=True>
+    >>> # It matches this far, but it's only a partial match.
+
+    >>> # The user enters 2 more digits:
+    >>> print(pattern.fullmatch('123', partial=True))
+    <regex.Match object; span=(0, 3), match='123', partial=True>
+    >>> # It matches this far, but it's only a partial match.
+
+    >>> # The user enters another digit:
+    >>> print(pattern.fullmatch('1234', partial=True))
+    <regex.Match object; span=(0, 4), match='1234'>
+    >>> # It's a complete match.
+
+    >>> # If the user enters another digit:
+    >>> print(pattern.fullmatch('12345', partial=True))
+    None
+    >>> # It's no longer a match.
+
+    >>> # This is a partial match:
+    >>> pattern.match('123', partial=True).partial
+    True
+
+    >>> # This is a complete match:
+    >>> pattern.match('1233', partial=True).partial
+    False
+
+``*`` operator not working correctly with sub() (`Hg issue 106 <https://github.com/mrabarnett/mrab-regex/issues/106>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sometimes it's not clear how zero-width matches should be handled. For example, should ``.*`` match 0 characters directly after matching >0 characters?
+
+.. sourcecode:: python
+
+    >>> regex.sub('.*', 'x', 'test')
+    'xx'
+    >>> regex.sub('.*?', '|', 'test')
+    '|||||||||'
+
+Added ``capturesdict`` (`Hg issue 86 <https://github.com/mrabarnett/mrab-regex/issues/86>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``capturesdict`` is a combination of ``groupdict`` and ``captures``:
+
+``groupdict`` returns a dict of the named groups and the last capture of those groups.
+
+``captures`` returns a list of all the captures of a group.
+
+``capturesdict`` returns a dict of the named groups and lists of all the captures of those groups.
+
+.. sourcecode:: python
+
+    >>> m = regex.match(r"(?:(?P<word>\w+) (?P<digits>\d+)\n)+", "one 1\ntwo 2\nthree 3\n")
+    >>> m.groupdict()
+    {'word': 'three', 'digits': '3'}
+    >>> m.captures("word")
+    ['one', 'two', 'three']
+    >>> m.captures("digits")
+    ['1', '2', '3']
+    >>> m.capturesdict()
+    {'word': ['one', 'two', 'three'], 'digits': ['1', '2', '3']}
+
+Added ``allcaptures`` and ``allspans`` (`Git issue 474 <https://github.com/mrabarnett/mrab-regex/issues/474>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``allcaptures`` returns a list of all the captures of all the groups.
+
+``allspans`` returns a list of all the spans of all the captures of all the groups.
+
+.. sourcecode:: python
+
+    >>> m = regex.match(r"(?:(?P<word>\w+) (?P<digits>\d+)\n)+", "one 1\ntwo 2\nthree 3\n")
+    >>> m.allcaptures()
+    (['one 1\ntwo 2\nthree 3\n'], ['one', 'two', 'three'], ['1', '2', '3'])
+    >>> m.allspans()
+    ([(0, 20)], [(0, 3), (6, 9), (12, 17)], [(4, 5), (10, 11), (18, 19)])
+
+Allow duplicate names of groups (`Hg issue 87 <https://github.com/mrabarnett/mrab-regex/issues/87>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Group names can be duplicated.
+
+.. sourcecode:: python
+
+    >>> # With optional groups:
+    >>>
+    >>> # Both groups capture, the second capture 'overwriting' the first.
+    >>> m = regex.match(r"(?P<item>\w+)? or (?P<item>\w+)?", "first or second")
+    >>> m.group("item")
+    'second'
+    >>> m.captures("item")
+    ['first', 'second']
+    >>> # Only the second group captures.
+    >>> m = regex.match(r"(?P<item>\w+)? or (?P<item>\w+)?", " or second")
+    >>> m.group("item")
+    'second'
+    >>> m.captures("item")
+    ['second']
+    >>> # Only the first group captures.
+    >>> m = regex.match(r"(?P<item>\w+)? or (?P<item>\w+)?", "first or ")
+    >>> m.group("item")
+    'first'
+    >>> m.captures("item")
+    ['first']
+    >>>
+    >>> # With mandatory groups:
+    >>>
+    >>> # Both groups capture, the second capture 'overwriting' the first.
+    >>> m = regex.match(r"(?P<item>\w*) or (?P<item>\w*)?", "first or second")
+    >>> m.group("item")
+    'second'
+    >>> m.captures("item")
+    ['first', 'second']
+    >>> # Again, both groups capture, the second capture 'overwriting' the first.
+    >>> m = regex.match(r"(?P<item>\w*) or (?P<item>\w*)", " or second")
+    >>> m.group("item")
+    'second'
+    >>> m.captures("item")
+    ['', 'second']
+    >>> # And yet again, both groups capture, the second capture 'overwriting' the first.
+    >>> m = regex.match(r"(?P<item>\w*) or (?P<item>\w*)", "first or ")
+    >>> m.group("item")
+    ''
+    >>> m.captures("item")
+    ['first', '']
+
+Added ``fullmatch`` (`issue #16203 <https://bugs.python.org/issue16203>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``fullmatch`` behaves like ``match``, except that it must match all of the string.
+
+.. sourcecode:: python
+
+    >>> print(regex.fullmatch(r"abc", "abc").span())
+    (0, 3)
+    >>> print(regex.fullmatch(r"abc", "abcx"))
+    None
+    >>> print(regex.fullmatch(r"abc", "abcx", endpos=3).span())
+    (0, 3)
+    >>> print(regex.fullmatch(r"abc", "xabcy", pos=1, endpos=4).span())
+    (1, 4)
+    >>>
+    >>> regex.match(r"a.*?", "abcd").group(0)
+    'a'
+    >>> regex.fullmatch(r"a.*?", "abcd").group(0)
+    'abcd'
+
+Added ``subf`` and ``subfn``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``subf`` and ``subfn`` are alternatives to ``sub`` and ``subn`` respectively. When passed a replacement string, they treat it as a format string.
+
+.. sourcecode:: python
+
+    >>> regex.subf(r"(\w+) (\w+)", "{0} => {2} {1}", "foo bar")
+    'foo bar => bar foo'
+    >>> regex.subf(r"(?P<word1>\w+) (?P<word2>\w+)", "{word2} {word1}", "foo bar")
+    'bar foo'
+
+Added ``expandf`` to match object
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``expandf`` is an alternative to ``expand``. When passed a replacement string, it treats it as a format string.
+
+.. sourcecode:: python
+
+    >>> m = regex.match(r"(\w+) (\w+)", "foo bar")
+    >>> m.expandf("{0} => {2} {1}")
+    'foo bar => bar foo'
+    >>>
+    >>> m = regex.match(r"(?P<word1>\w+) (?P<word2>\w+)", "foo bar")
+    >>> m.expandf("{word2} {word1}")
+    'bar foo'
+
+Detach searched string
+^^^^^^^^^^^^^^^^^^^^^^
+
+A match object contains a reference to the string that was searched, via its ``string`` attribute. The ``detach_string`` method will 'detach' that string, making it available for garbage collection, which might save valuable memory if that string is very large.
+
+.. sourcecode:: python
+
+    >>> m = regex.search(r"\w+", "Hello world")
+    >>> print(m.group())
+    Hello
+    >>> print(m.string)
+    Hello world
+    >>> m.detach_string()
+    >>> print(m.group())
+    Hello
+    >>> print(m.string)
+    None
+
+Recursive patterns (`Hg issue 27 <https://github.com/mrabarnett/mrab-regex/issues/27>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Recursive and repeated patterns are supported.
+
+``(?R)`` or ``(?0)`` tries to match the entire regex recursively. ``(?1)``, ``(?2)``, etc, try to match the relevant group.
+
+``(?&name)`` tries to match the named group.
+
+.. sourcecode:: python
+
+    >>> regex.match(r"(Tarzan|Jane) loves (?1)", "Tarzan loves Jane").groups()
+    ('Tarzan',)
+    >>> regex.match(r"(Tarzan|Jane) loves (?1)", "Jane loves Tarzan").groups()
+    ('Jane',)
+
+    >>> m = regex.search(r"(\w)(?:(?R)|(\w?))\1", "kayak")
+    >>> m.group(0, 1, 2)
+    ('kayak', 'k', None)
+
+The first two examples show how the subpattern within the group is reused, but is _not_ itself a group. In other words, ``"(Tarzan|Jane) loves (?1)"`` is equivalent to ``"(Tarzan|Jane) loves (?:Tarzan|Jane)"``.
+
+It's possible to backtrack into a recursed or repeated group.
+
+You can't call a group if there is more than one group with that group name or group number (``"ambiguous group reference"``).
+
+The alternative forms ``(?P>name)`` and ``(?P&name)`` are also supported.
+
+Full Unicode case-folding is supported
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In version 1 behaviour, the regex module uses full case-folding when performing case-insensitive matches in Unicode.
+
+.. sourcecode:: python
+
+    >>> regex.match(r"(?iV1)strasse", "stra\N{LATIN SMALL LETTER SHARP S}e").span()
+    (0, 6)
+    >>> regex.match(r"(?iV1)stra\N{LATIN SMALL LETTER SHARP S}e", "STRASSE").span()
+    (0, 7)
+
+In version 0 behaviour, it uses simple case-folding for backward compatibility with the re module.
+
+Approximate "fuzzy" matching (`Hg issue 12 <https://github.com/mrabarnett/mrab-regex/issues/12>`_, `Hg issue 41 <https://github.com/mrabarnett/mrab-regex/issues/41>`_, `Hg issue 109 <https://github.com/mrabarnett/mrab-regex/issues/109>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Regex usually attempts an exact match, but sometimes an approximate, or "fuzzy", match is needed, for those cases where the text being searched may contain errors in the form of inserted, deleted or substituted characters.
+
+A fuzzy regex specifies which types of errors are permitted, and, optionally, either the minimum and maximum or only the maximum permitted number of each type. (You cannot specify only a minimum.)
+
+The 3 types of error are:
+
+* Insertion, indicated by "i"
+
+* Deletion, indicated by "d"
+
+* Substitution, indicated by "s"
+
+In addition, "e" indicates any type of error.
+
+The fuzziness of a regex item is specified between "{" and "}" after the item.
+
+Examples:
+
+* ``foo`` match "foo" exactly
+
+* ``(?:foo){i}`` match "foo", permitting insertions
+
+* ``(?:foo){d}`` match "foo", permitting deletions
+
+* ``(?:foo){s}`` match "foo", permitting substitutions
+
+* ``(?:foo){i,s}`` match "foo", permitting insertions and substitutions
+
+* ``(?:foo){e}`` match "foo", permitting errors
+
+If a certain type of error is specified, then any type not specified will **not** be permitted.
+
+In the following examples I'll omit the item and write only the fuzziness:
+
+* ``{d<=3}`` permit at most 3 deletions, but no other types
+
+* ``{i<=1,s<=2}`` permit at most 1 insertion and at most 2 substitutions, but no deletions
+
+* ``{1<=e<=3}`` permit at least 1 and at most 3 errors
+
+* ``{i<=2,d<=2,e<=3}`` permit at most 2 insertions, at most 2 deletions, at most 3 errors in total, but no substitutions
+
+It's also possible to state the costs of each type of error and the maximum permitted total cost.
+
+Examples:
+
+* ``{2i+2d+1s<=4}`` each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
+
+* ``{i<=1,d<=1,s<=1,2i+2d+1s<=4}`` at most 1 insertion, at most 1 deletion, at most 1 substitution; each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
+
+You can also use "<" instead of "<=" if you want an exclusive minimum or maximum.
+
+You can add a test to perform on a character that's substituted or inserted.
+
+Examples:
+
+* ``{s<=2:[a-z]}`` at most 2 substitutions, which must be in the character set ``[a-z]``.
+
+* ``{s<=2,i<=3:\d}`` at most 2 substitutions, at most 3 insertions, which must be digits.
+
+By default, fuzzy matching searches for the first match that meets the given constraints. The ``ENHANCEMATCH`` flag will cause it to attempt to improve the fit (i.e. reduce the number of errors) of the match that it has found.
+
+The ``BESTMATCH`` flag will make it search for the best match instead.
+
+Further examples to note:
+
+* ``regex.search("(dog){e}", "cat and dog")[1]`` returns ``"cat"`` because that matches ``"dog"`` with 3 errors (an unlimited number of errors is permitted).
+
+* ``regex.search("(dog){e<=1}", "cat and dog")[1]`` returns ``" dog"`` (with a leading space) because that matches ``"dog"`` with 1 error, which is within the limit.
+
+* ``regex.search("(?e)(dog){e<=1}", "cat and dog")[1]`` returns ``"dog"`` (without a leading space) because the fuzzy search matches ``" dog"`` with 1 error, which is within the limit, and the ``(?e)`` then makes it attempt a better fit.
+
+In the first two examples there are perfect matches later in the string, but in neither case is it the first possible match.
+
+The match object has an attribute ``fuzzy_counts`` which gives the total number of substitutions, insertions and deletions.
+
+.. sourcecode:: python
+
+    >>> # A 'raw' fuzzy match:
+    >>> regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts
+    (0, 0, 1)
+    >>> # 0 substitutions, 0 insertions, 1 deletion.
+
+    >>> # A better match might be possible if the ENHANCEMATCH flag is used:
+    >>> regex.fullmatch(r"(?e)(?:cats|cat){e<=1}", "cat").fuzzy_counts
+    (0, 0, 0)
+    >>> # 0 substitutions, 0 insertions, 0 deletions.
+
+The match object also has an attribute ``fuzzy_changes`` which gives a tuple of the positions of the substitutions, insertions and deletions.
+
+.. sourcecode:: python
+
+    >>> m = regex.search('(fuu){i<=2,d<=2,e<=5}', 'anaconda foo bar')
+    >>> m
+    <regex.Match object; span=(7, 10), match='a f', fuzzy_changes=([], [7, 8], [10, 11])>
+    >>> m.fuzzy_changes
+    ([], [7, 8], [10, 11])
+
+What this means is that if the matched part of the string had been:
+
+.. sourcecode:: python
+
+    'anacondfuuoo bar'
+
+it would've been an exact match.
+
+However, there were insertions at positions 7 and 8:
+
+.. sourcecode:: python
+
+    'anaconda fuuoo bar'
+            ^^
+
+and deletions at positions 10 and 11:
+
+.. sourcecode:: python
+
+    'anaconda f~~oo bar'
+              ^^
+
+So the actual string was:
+
+.. sourcecode:: python
+
+    'anaconda foo bar'
+
+Named lists ``\L<name>`` (`Hg issue 11 <https://github.com/mrabarnett/mrab-regex/issues/11>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are occasions where you may want to include a list (actually, a set) of options in a regex.
+
+One way is to build the pattern like this:
+
+.. sourcecode:: python
+
+    >>> p = regex.compile(r"first|second|third|fourth|fifth")
+
+but if the list is large, parsing the resulting regex can take considerable time, and care must also be taken that the strings are properly escaped and properly ordered, for example, "cats" before "cat".
+
+The new alternative is to use a named list:
+
+.. sourcecode:: python
+
+    >>> option_set = ["first", "second", "third", "fourth", "fifth"]
+    >>> p = regex.compile(r"\L<options>", options=option_set)
+
+The order of the items is irrelevant; they are treated as a set. The named lists are available as the ``.named_lists`` attribute of the pattern object:
+
+.. sourcecode:: python
+
+    >>> print(p.named_lists)
+    {'options': frozenset({'third', 'first', 'fifth', 'fourth', 'second'})}
+
+If there are any unused keyword arguments, ``ValueError`` will be raised unless you tell it otherwise:
+
+.. sourcecode:: python
+
+    >>> option_set = ["first", "second", "third", "fourth", "fifth"]
+    >>> p = regex.compile(r"\L<options>", options=option_set, other_options=[])
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "C:\Python310\lib\site-packages\regex\regex.py", line 353, in compile
+        return _compile(pattern, flags, ignore_unused, kwargs, cache_pattern)
+      File "C:\Python310\lib\site-packages\regex\regex.py", line 500, in _compile
+        complain_unused_args()
+      File "C:\Python310\lib\site-packages\regex\regex.py", line 483, in complain_unused_args
+        raise ValueError('unused keyword argument {!a}'.format(any_one))
+    ValueError: unused keyword argument 'other_options'
+    >>> p = regex.compile(r"\L<options>", options=option_set, other_options=[], ignore_unused=True)
+    >>> p = regex.compile(r"\L<options>", options=option_set, other_options=[], ignore_unused=False)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "C:\Python310\lib\site-packages\regex\regex.py", line 353, in compile
+        return _compile(pattern, flags, ignore_unused, kwargs, cache_pattern)
+      File "C:\Python310\lib\site-packages\regex\regex.py", line 500, in _compile
+        complain_unused_args()
+      File "C:\Python310\lib\site-packages\regex\regex.py", line 483, in complain_unused_args
+        raise ValueError('unused keyword argument {!a}'.format(any_one))
+    ValueError: unused keyword argument 'other_options'
+    >>>
+
+Start and end of word
+^^^^^^^^^^^^^^^^^^^^^
+
+``\m`` matches at the start of a word.
+
+``\M`` matches at the end of a word.
+
+Compare with ``\b``, which matches at the start or end of a word.
+
+Unicode line separators
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Normally the only line separator is ``\n`` (``\x0A``), but if the ``WORD`` flag is turned on then the line separators are ``\x0D\x0A``, ``\x0A``, ``\x0B``, ``\x0C`` and ``\x0D``, plus ``\x85``, ``\u2028`` and ``\u2029`` when working with Unicode.
+
+This affects the regex dot ``"."``, which, with the ``DOTALL`` flag turned off, matches any character except a line separator. It also affects the line anchors ``^`` and ``$`` (in multiline mode).
+
+Set operators
+^^^^^^^^^^^^^
+
+**Version 1 behaviour only**
+
+Set operators have been added, and a set ``[...]`` can include nested sets.
+
+The operators, in order of increasing precedence, are:
+
+* ``||`` for union ("x||y" means "x or y")
+
+* ``~~`` (double tilde) for symmetric difference ("x~~y" means "x or y, but not both")
+
+* ``&&`` for intersection ("x&&y" means "x and y")
+
+* ``--`` (double dash) for difference ("x--y" means "x but not y")
+
+Implicit union, i.e. simple juxtaposition like in ``[ab]``, has the highest precedence. Thus, ``[ab&&cd]`` is the same as ``[[a||b]&&[c||d]]``.
+
+Examples:
+
+* ``[ab]`` # Set containing 'a' and 'b'
+
+* ``[a-z]`` # Set containing 'a' .. 'z'
+
+* ``[[a-z]--[qw]]`` # Set containing 'a' .. 'z', but not 'q' or 'w'
+
+* ``[a-z--qw]`` # Same as above
+
+* ``[\p{L}--QW]`` # Set containing all letters except 'Q' and 'W'
+
+* ``[\p{N}--[0-9]]`` # Set containing all numbers except '0' .. '9'
+
+* ``[\p{ASCII}&&\p{Letter}]`` # Set containing all characters which are ASCII and letter
+
+regex.escape (`issue #2650 <https://bugs.python.org/issue2650>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+regex.escape has an additional keyword parameter ``special_only``. When True, only 'special' regex characters, such as '?', are escaped.
+
+.. sourcecode:: python
+
+    >>> regex.escape("foo!?", special_only=False)
+    'foo\\!\\?'
+    >>> regex.escape("foo!?", special_only=True)
+    'foo!\\?'
+
+regex.escape (`Hg issue 249 <https://github.com/mrabarnett/mrab-regex/issues/249>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+regex.escape has an additional keyword parameter ``literal_spaces``. When True, spaces are not escaped.
+
+.. sourcecode:: python
+
+    >>> regex.escape("foo bar!?", literal_spaces=False)
+    'foo\\ bar!\\?'
+    >>> regex.escape("foo bar!?", literal_spaces=True)
+    'foo bar!\\?'
+
+Repeated captures (`issue #7132 <https://bugs.python.org/issue7132>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A match object has additional methods which return information on all the successful matches of a repeated group. These methods are:
+
+* ``matchobject.captures([group1, ...])``
+
+  * Returns a list of the strings matched in a group or groups. Compare with ``matchobject.group([group1, ...])``.
+
+* ``matchobject.starts([group])``
+
+  * Returns a list of the start positions. Compare with ``matchobject.start([group])``.
+
+* ``matchobject.ends([group])``
+
+  * Returns a list of the end positions. Compare with ``matchobject.end([group])``.
+
+* ``matchobject.spans([group])``
+
+  * Returns a list of the spans. Compare with ``matchobject.span([group])``.
+
+.. sourcecode:: python
+
+    >>> m = regex.search(r"(\w{3})+", "123456789")
+    >>> m.group(1)
+    '789'
+    >>> m.captures(1)
+    ['123', '456', '789']
+    >>> m.start(1)
+    6
+    >>> m.starts(1)
+    [0, 3, 6]
+    >>> m.end(1)
+    9
+    >>> m.ends(1)
+    [3, 6, 9]
+    >>> m.span(1)
+    (6, 9)
+    >>> m.spans(1)
+    [(0, 3), (3, 6), (6, 9)]
+
+Atomic grouping ``(?>...)`` (`issue #433030 <https://bugs.python.org/issue433030>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If the following pattern subsequently fails, then the subpattern as a whole will fail.
+
+Possessive quantifiers
+^^^^^^^^^^^^^^^^^^^^^^
+
+``(?:...)?+`` ; ``(?:...)*+`` ; ``(?:...)++`` ; ``(?:...){min,max}+``
+
+The subpattern is matched up to 'max' times. If the following pattern subsequently fails, then all the repeated subpatterns will fail as a whole. For example, ``(?:...)++`` is equivalent to ``(?>(?:...)+)``.
+
+Scoped flags (`issue #433028 <https://bugs.python.org/issue433028>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``(?flags-flags:...)``
+
+The flags will apply only to the subpattern. Flags can be turned on or off.
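+A minimal sketch of a scoped flag in action, limiting case-insensitivity to part of the pattern:
+
+.. sourcecode:: python
+
+    >>> bool(regex.match(r'(?i:hello) World', 'HELLO World'))
+    True
+    >>> bool(regex.match(r'(?i:hello) World', 'HELLO WORLD'))
+    False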
+
+Definition of 'word' character (`issue #1693050 <https://bugs.python.org/issue1693050>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The definition of a 'word' character has been expanded for Unicode. It conforms to the Unicode specification at ``http://www.unicode.org/reports/tr29/``.
+
+Variable-length lookbehind
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A lookbehind can match a variable-length string.
+
+Flags argument for regex.split, regex.sub and regex.subn (`issue #3482 <https://bugs.python.org/issue3482>`_)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``regex.split``, ``regex.sub`` and ``regex.subn`` support a 'flags' argument.
+
+Pos and endpos arguments for regex.sub and regex.subn
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``regex.sub`` and ``regex.subn`` support 'pos' and 'endpos' arguments.
+
+'Overlapped' argument for regex.findall and regex.finditer
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``regex.findall`` and ``regex.finditer`` support an 'overlapped' flag which permits overlapped matches.
+
+Splititer
+^^^^^^^^^
+
+``regex.splititer`` has been added. It's a generator equivalent of ``regex.split``.
+
+Subscripting match objects for groups
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A match object accepts access to the groups via subscripting and slicing:
+
+.. sourcecode:: python
+
+    >>> m = regex.search(r"(?P<before>.*?)(?P<num>\d+)(?P<after>.*)", "pqr123stu")
+    >>> print(m["before"])
+    pqr
+    >>> print(len(m))
+    4
+    >>> print(m[:])
+    ('pqr123stu', 'pqr', '123', 'stu')
+
+Named groups
+^^^^^^^^^^^^
+
+Groups can be named with ``(?<name>...)`` as well as the existing ``(?P<name>...)``.
+
+Group references
+^^^^^^^^^^^^^^^^
+
+Groups can be referenced within a pattern with ``\g<name>``. This also allows there to be more than 99 groups.
+
+Named characters ``\N{name}``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Named characters are supported. Note that only those known by Python's Unicode database will be recognised.
+
+Unicode codepoint properties, including scripts and blocks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``\p{property=value}``; ``\P{property=value}``; ``\p{value}``; ``\P{value}``
+
+Many Unicode properties are supported, including blocks and scripts. ``\p{property=value}`` or ``\p{property:value}`` matches a character whose property ``property`` has value ``value``. The inverse of ``\p{property=value}`` is ``\P{property=value}`` or ``\p{^property=value}``.
+
+If the short form ``\p{value}`` is used, the properties are checked in the order: ``General_Category``, ``Script``, ``Block``, binary property:
+
+* ``Latin``, the 'Latin' script (``Script=Latin``).
+
+* ``BasicLatin``, the 'BasicLatin' block (``Block=BasicLatin``).
+
+* ``Alphabetic``, the 'Alphabetic' binary property (``Alphabetic=Yes``).
+
+A short form starting with ``Is`` indicates a script or binary property:
+
+* ``IsLatin``, the 'Latin' script (``Script=Latin``).
+
+* ``IsAlphabetic``, the 'Alphabetic' binary property (``Alphabetic=Yes``).
+
+A short form starting with ``In`` indicates a block property:
+
+* ``InBasicLatin``, the 'BasicLatin' block (``Block=BasicLatin``).
+
+POSIX character classes
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``[[:alpha:]]``; ``[[:^alpha:]]``
+
+POSIX character classes are supported. These are normally treated as an alternative form of ``\p{...}``.
+
+The exceptions are ``alnum``, ``digit``, ``punct`` and ``xdigit``, whose definitions are different from those of Unicode.
+
+``[[:alnum:]]`` is equivalent to ``\p{posix_alnum}``.
+ +``[[:digit:]]`` is equivalent to ``\p{posix_digit}``. + +``[[:punct:]]`` is equivalent to ``\p{posix_punct}``. + +``[[:xdigit:]]`` is equivalent to ``\p{posix_xdigit}``. + +Search anchor ``\G`` +^^^^^^^^^^^^^^^^^^^^ + +A search anchor has been added. It matches at the position where each search started/continued and can be used for contiguous matches or in negative variable-length lookbehinds to limit how far back the lookbehind goes: + +.. sourcecode:: python + + >>> regex.findall(r"\w{2}", "abcd ef") + ['ab', 'cd', 'ef'] + >>> regex.findall(r"\G\w{2}", "abcd ef") + ['ab', 'cd'] + +* The search starts at position 0 and matches 'ab'. + +* The search continues at position 2 and matches 'cd'. + +* The search continues at position 4 and fails to match any letters. + +* The anchor stops the search start position from being advanced, so there are no more results. + +Reverse searching +^^^^^^^^^^^^^^^^^ + +Searches can also work backwards: + +.. sourcecode:: python + + >>> regex.findall(r".", "abc") + ['a', 'b', 'c'] + >>> regex.findall(r"(?r).", "abc") + ['c', 'b', 'a'] + +Note that the result of a reverse search is not necessarily the reverse of a forward search: + +.. sourcecode:: python + + >>> regex.findall(r"..", "abcde") + ['ab', 'cd'] + >>> regex.findall(r"(?r)..", "abcde") + ['de', 'bc'] + +Matching a single grapheme ``\X`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The grapheme matcher is supported. It conforms to the Unicode specification at ``http://www.unicode.org/reports/tr29/``. + +Branch reset ``(?|...|...)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Group numbers will be reused across the alternatives, but groups with different names will have different group numbers. + +.. sourcecode:: python + + >>> regex.match(r"(?|(first)|(second))", "first").groups() + ('first',) + >>> regex.match(r"(?|(first)|(second))", "second").groups() + ('second',) + +Note that there is only one group. + +Default Unicode word boundary +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``WORD`` flag changes the definition of a 'word boundary' to that of a default Unicode word boundary. This applies to ``\b`` and ``\B``. + +Timeout +^^^^^^^ + +The matching methods and functions support timeouts. The timeout (in seconds) applies to the entire operation: + +.. sourcecode:: python + + >>> from time import sleep + >>> + >>> def fast_replace(m): + ... return 'X' + ... + >>> def slow_replace(m): + ... sleep(0.5) + ... return 'X' + ... 
+
+Timeout
+^^^^^^^
+
+The matching methods and functions support timeouts. The timeout (in seconds) applies to the entire operation:
+
+.. sourcecode:: python
+
+    >>> from time import sleep
+    >>>
+    >>> def fast_replace(m):
+    ...     return 'X'
+    ...
+    >>> def slow_replace(m):
+    ...     sleep(0.5)
+    ...     return 'X'
+    ...
+    >>> regex.sub(r'[a-z]', fast_replace, 'abcde', timeout=2)
+    'XXXXX'
+    >>> regex.sub(r'[a-z]', slow_replace, 'abcde', timeout=2)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "C:\Python310\lib\site-packages\regex\regex.py", line 278, in sub
+        return pat.sub(repl, string, count, pos, endpos, concurrent, timeout)
+    TimeoutError: regex timed out
diff --git a/addon/synthDrivers/regex/regex-2024.11.6.dist-info/RECORD b/addon/synthDrivers/regex/regex-2024.11.6.dist-info/RECORD
new file mode 100644
index 0000000..f8cee0d
--- /dev/null
+++ b/addon/synthDrivers/regex/regex-2024.11.6.dist-info/RECORD
@@ -0,0 +1,10 @@
+regex/__init__.py,sha256=6giZBSRLmTZfvQrcVoS6MaL5gKcwtfZlSXATBex49lU,68
+regex/_regex.cp311-win32.pyd,sha256=My-18sxOzn5hR_Yf9E24Ka-CF_RxqurnNM6EuVwEPeI,676864
+regex/_regex_core.py,sha256=42irJI2RTw4e_1_MxKLAvToxtcVqohr_NUBsr99XAwM,145523
+regex/regex.py,sha256=kpdQLXvgjDPukHeYRACUThoyfEqLndsRgEn913R20s0,33429
+regex/test_regex.py,sha256=F4Ce-5HQuw4PY1A3lhs5lJGNXBxCuBZK4-jQcDY-RmM,226528
+regex-2024.11.6.dist-info/LICENSE.txt,sha256=PSIMBllLgmu6vxEDEvYPOF-Z5X5Sn6S55Tb3kJH4tMc,11792
+regex-2024.11.6.dist-info/METADATA,sha256=zk4B1w8xvPUEmjWRzaCfWl1PFA1RYke3zmMe9IwQRKU,41547
+regex-2024.11.6.dist-info/WHEEL,sha256=NAdS8ur5Dxg2OJgOdSs8LUmGdKrRvTenOWk-maunoig,97
+regex-2024.11.6.dist-info/top_level.txt,sha256=aQmiDMhNTF26cCK4_7D-qaVvhbxClG0wyCTnEhkzYBs,6
+regex-2024.11.6.dist-info/RECORD,,
diff --git a/addon/synthDrivers/regex/regex-2024.11.6.dist-info/WHEEL b/addon/synthDrivers/regex/regex-2024.11.6.dist-info/WHEEL
new file mode 100644
index 0000000..7e087c5
--- /dev/null
+++ b/addon/synthDrivers/regex/regex-2024.11.6.dist-info/WHEEL
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.3.0)
+Root-Is-Purelib: false
+Tag: cp311-cp311-win32
+
diff --git a/addon/synthDrivers/regex/regex-2024.11.6.dist-info/top_level.txt b/addon/synthDrivers/regex/regex-2024.11.6.dist-info/top_level.txt
new file mode 100644
index 0000000..4f9256d
--- /dev/null
+++ b/addon/synthDrivers/regex/regex-2024.11.6.dist-info/top_level.txt
@@ -0,0 +1 @@
+regex
diff --git a/addon/synthDrivers/regex/regex/__init__.py b/addon/synthDrivers/regex/regex/__init__.py
new file mode 100644
index 0000000..eb06564
--- /dev/null
+++ b/addon/synthDrivers/regex/regex/__init__.py
@@ -0,0 +1,3 @@
+from .regex import *
+from . import regex
+__all__ = regex.__all__
diff --git a/addon/synthDrivers/regex/regex/_regex.cp311-win32.pyd b/addon/synthDrivers/regex/regex/_regex.cp311-win32.pyd
new file mode 100644
index 0000000..1987f0b
Binary files /dev/null and b/addon/synthDrivers/regex/regex/_regex.cp311-win32.pyd differ
diff --git a/addon/synthDrivers/regex/regex/_regex_core.py b/addon/synthDrivers/regex/regex/_regex_core.py
new file mode 100644
index 0000000..b2ffeae
--- /dev/null
+++ b/addon/synthDrivers/regex/regex/_regex_core.py
@@ -0,0 +1,4495 @@
+#
+# Secret Labs' Regular Expression Engine core module
+#
+# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
+#
+# This version of the SRE library can be redistributed under CNRI's
+# Python 1.6 license. For any other use, please contact Secret Labs
+# AB (info@pythonware.com).
+#
+# Portions of this engine have been developed in cooperation with
+# CNRI. Hewlett-Packard provided funding for 1.6 integration and
+# other compatibility work.
+# +# 2010-01-16 mrab Python front-end re-written and extended + +import enum +import string +import unicodedata +from collections import defaultdict + +import regex._regex as _regex + +__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", + "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P", + "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE", + "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error", + "Scanner", "RegexFlag"] + +# The regex exception. +class error(Exception): + """Exception raised for invalid regular expressions. + + Attributes: + + msg: The unformatted error message + pattern: The regular expression pattern + pos: The position in the pattern where compilation failed, or None + lineno: The line number where compilation failed, unless pos is None + colno: The column number where compilation failed, unless pos is None + """ + + def __init__(self, message, pattern=None, pos=None): + newline = '\n' if isinstance(pattern, str) else b'\n' + self.msg = message + self.pattern = pattern + self.pos = pos + if pattern is not None and pos is not None: + self.lineno = pattern.count(newline, 0, pos) + 1 + self.colno = pos - pattern.rfind(newline, 0, pos) + + message = "{} at position {}".format(message, pos) + + if newline in pattern: + message += " (line {}, column {})".format(self.lineno, + self.colno) + + Exception.__init__(self, message) + +# The exception for when a positional flag has been turned on in the old +# behaviour. +class _UnscopedFlagSet(Exception): + pass + +# The exception for when parsing fails and we want to try something else. +class ParseError(Exception): + pass + +# The exception for when there isn't a valid first set. +class _FirstSetError(Exception): + pass + +# Flags. +class RegexFlag(enum.IntFlag): + A = ASCII = 0x80 # Assume ASCII locale. + B = BESTMATCH = 0x1000 # Best fuzzy match. + D = DEBUG = 0x200 # Print parsed pattern. + E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first + # fuzzy match. + F = FULLCASE = 0x4000 # Unicode full case-folding. + I = IGNORECASE = 0x2 # Ignore case. + L = LOCALE = 0x4 # Assume current 8-bit locale. + M = MULTILINE = 0x8 # Make anchors look for newline. + P = POSIX = 0x10000 # POSIX-style matching (leftmost longest). + R = REVERSE = 0x400 # Search backwards. + S = DOTALL = 0x10 # Make dot match newline. + U = UNICODE = 0x20 # Assume Unicode locale. + V0 = VERSION0 = 0x2000 # Old legacy behaviour. + V1 = VERSION1 = 0x100 # New enhanced behaviour. + W = WORD = 0x800 # Default Unicode word breaks. + X = VERBOSE = 0x40 # Ignore whitespace and comments. + T = TEMPLATE = 0x1 # Template (present because re module has it). + + def __repr__(self): + if self._name_ is not None: + return 'regex.%s' % self._name_ + + value = self._value_ + members = [] + negative = value < 0 + + if negative: + value = ~value + + for m in self.__class__: + if value & m._value_: + value &= ~m._value_ + members.append('regex.%s' % m._name_) + + if value: + members.append(hex(value)) + + res = '|'.join(members) + + if negative: + if len(members) > 1: + res = '~(%s)' % res + else: + res = '~%s' % res + + return res + + __str__ = object.__str__ + +globals().update(RegexFlag.__members__) + +DEFAULT_VERSION = VERSION1 + +_ALL_VERSIONS = VERSION0 | VERSION1 +_ALL_ENCODINGS = ASCII | LOCALE | UNICODE + +# The default flags for the various versions. +DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE} + +# The mask for the flags. 
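+# (Note: global flags such as (?V1) or (?r) apply to the whole pattern;
+# scoped flags such as (?i) or (?m) can also be applied to just part of it,
+# e.g. "a(?i:bc)".)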
+GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX | + REVERSE) +SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE | + _ALL_ENCODINGS) + +ALPHA = frozenset(string.ascii_letters) +DIGITS = frozenset(string.digits) +ALNUM = ALPHA | DIGITS +OCT_DIGITS = frozenset(string.octdigits) +HEX_DIGITS = frozenset(string.hexdigits) +SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""]) +NAMED_CHAR_PART = ALNUM | frozenset(" -") +PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.") +SET_OPS = ("||", "~~", "&&", "--") + +# The width of the code words inside the regex engine. +BYTES_PER_CODE = _regex.get_code_size() +BITS_PER_CODE = BYTES_PER_CODE * 8 + +# The repeat count which represents infinity. +UNLIMITED = (1 << BITS_PER_CODE) - 1 + +# The regular expression flags. +REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE, + "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE, + "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x": + VERBOSE} + +# The case flags. +CASE_FLAGS = FULLCASE | IGNORECASE +NOCASE = 0 +FULLIGNORECASE = FULLCASE | IGNORECASE + +FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE + +CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE, + FULLIGNORECASE: FULLIGNORECASE} + +# The number of digits in hexadecimal escapes. +HEX_ESCAPES = {"x": 2, "u": 4, "U": 8} + +# The names of the opcodes. +OPCODES = """ +FAILURE +SUCCESS +ANY +ANY_ALL +ANY_ALL_REV +ANY_REV +ANY_U +ANY_U_REV +ATOMIC +BOUNDARY +BRANCH +CALL_REF +CHARACTER +CHARACTER_IGN +CHARACTER_IGN_REV +CHARACTER_REV +CONDITIONAL +DEFAULT_BOUNDARY +DEFAULT_END_OF_WORD +DEFAULT_START_OF_WORD +END +END_OF_LINE +END_OF_LINE_U +END_OF_STRING +END_OF_STRING_LINE +END_OF_STRING_LINE_U +END_OF_WORD +FUZZY +GRAPHEME_BOUNDARY +GREEDY_REPEAT +GROUP +GROUP_CALL +GROUP_EXISTS +KEEP +LAZY_REPEAT +LOOKAROUND +NEXT +PROPERTY +PROPERTY_IGN +PROPERTY_IGN_REV +PROPERTY_REV +PRUNE +RANGE +RANGE_IGN +RANGE_IGN_REV +RANGE_REV +REF_GROUP +REF_GROUP_FLD +REF_GROUP_FLD_REV +REF_GROUP_IGN +REF_GROUP_IGN_REV +REF_GROUP_REV +SEARCH_ANCHOR +SET_DIFF +SET_DIFF_IGN +SET_DIFF_IGN_REV +SET_DIFF_REV +SET_INTER +SET_INTER_IGN +SET_INTER_IGN_REV +SET_INTER_REV +SET_SYM_DIFF +SET_SYM_DIFF_IGN +SET_SYM_DIFF_IGN_REV +SET_SYM_DIFF_REV +SET_UNION +SET_UNION_IGN +SET_UNION_IGN_REV +SET_UNION_REV +SKIP +START_OF_LINE +START_OF_LINE_U +START_OF_STRING +START_OF_WORD +STRING +STRING_FLD +STRING_FLD_REV +STRING_IGN +STRING_IGN_REV +STRING_REV +FUZZY_EXT +""" + +# Define the opcodes in a namespace. +class Namespace: + pass + +OP = Namespace() +for i, op in enumerate(OPCODES.split()): + setattr(OP, op, i) + +def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5): + """Make room in the given cache. + + Args: + cache_dict: The cache dictionary to modify. + args_dict: The dictionary of named list args used by patterns. + max_length: Maximum # of entries in cache_dict before it is shrunk. + divisor: Cache will shrink to max_length - 1/divisor*max_length items. + """ + # Toss out a fraction of the entries at random to make room for new ones. + # A random algorithm was chosen as opposed to simply cache_dict.popitem() + # as popitem could penalize the same regular expression repeatedly based + # on its internal hash value. Being random should spread the cache miss + # love around. + cache_keys = tuple(cache_dict.keys()) + overage = len(cache_keys) - max_length + if overage < 0: + # Cache is already within limits. 
Normally this should not happen + # but it could due to multithreading. + return + + number_to_toss = max_length // divisor + overage + + # The import is done here to avoid a circular dependency. + import random + if not hasattr(random, 'sample'): + # Do nothing while resolving the circular dependency: + # re->random->warnings->tokenize->string->re + return + + for doomed_key in random.sample(cache_keys, number_to_toss): + try: + del cache_dict[doomed_key] + except KeyError: + # Ignore problems if the cache changed from another thread. + pass + + # Rebuild the arguments and locale-sensitivity dictionaries. + args_dict.clear() + sensitivity_dict = {} + for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict): + args_dict[pattern, pattern_type, flags, default_version, locale] = args + try: + sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern] + except KeyError: + pass + + locale_sensitive.clear() + locale_sensitive.update(sensitivity_dict) + +def _fold_case(info, string): + "Folds the case of a string." + flags = info.flags + if (flags & _ALL_ENCODINGS) == 0: + flags |= info.guess_encoding + + return _regex.fold_case(flags, string) + +def is_cased_i(info, char): + "Checks whether a character is cased." + return len(_regex.get_all_cases(info.flags, char)) > 1 + +def is_cased_f(flags, char): + "Checks whether a character is cased." + return len(_regex.get_all_cases(flags, char)) > 1 + +def _compile_firstset(info, fs): + "Compiles the firstset for the pattern." + reverse = bool(info.flags & REVERSE) + fs = _check_firstset(info, reverse, fs) + if not fs: + return [] + + # Compile the firstset. + return fs.compile(reverse) + +def _check_firstset(info, reverse, fs): + "Checks the firstset for the pattern." + if not fs or None in fs: + return None + + # If we ignore the case, for simplicity we won't build a firstset. + members = set() + case_flags = NOCASE + for i in fs: + if isinstance(i, Character) and not i.positive: + return None + +# if i.case_flags: +# if isinstance(i, Character): +# if is_cased_i(info, i.value): +# return [] +# elif isinstance(i, SetBase): +# return [] + case_flags |= i.case_flags + members.add(i.with_flags(case_flags=NOCASE)) + + if case_flags == (FULLCASE | IGNORECASE): + return None + + # Build the firstset. + fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE, + zerowidth=True) + fs = fs.optimise(info, reverse, in_set=True) + + return fs + +def _flatten_code(code): + "Flattens the code from a list of tuples." + flat_code = [] + for c in code: + flat_code.extend(c) + + return flat_code + +def make_case_flags(info): + "Makes the case flags." + flags = info.flags & CASE_FLAGS + + # Turn off FULLCASE if ASCII is turned on. + if info.flags & ASCII: + flags &= ~FULLCASE + + return flags + +def make_character(info, value, in_set=False): + "Makes a character literal." + if in_set: + # A character set is built case-sensitively. + return Character(value) + + return Character(value, case_flags=make_case_flags(info)) + +def make_ref_group(info, name, position): + "Makes a group reference." + return RefGroup(info, name, position, case_flags=make_case_flags(info)) + +def make_string_set(info, name): + "Makes a string set." + return StringSet(info, name, case_flags=make_case_flags(info)) + +def make_property(info, prop, in_set): + "Makes a property." + if in_set: + return prop + + return prop.with_flags(case_flags=make_case_flags(info)) + +def _parse_pattern(source, info): + "Parses a pattern, eg. 'a|b|c'." 
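+    # For example, "a|b|c" yields a single Branch node with three branches,
+    # while a pattern without "|" remains a plain sequence.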
+ branches = [parse_sequence(source, info)] + while source.match("|"): + branches.append(parse_sequence(source, info)) + + if len(branches) == 1: + return branches[0] + return Branch(branches) + +def parse_sequence(source, info): + "Parses a sequence, eg. 'abc'." + sequence = [None] + case_flags = make_case_flags(info) + while True: + saved_pos = source.pos + ch = source.get() + if ch in SPECIAL_CHARS: + if ch in ")|": + # The end of a sequence. At the end of the pattern ch is "". + source.pos = saved_pos + break + elif ch == "\\": + # An escape sequence outside a set. + sequence.append(parse_escape(source, info, False)) + elif ch == "(": + # A parenthesised subpattern or a flag. + element = parse_paren(source, info) + if element is None: + case_flags = make_case_flags(info) + else: + sequence.append(element) + elif ch == ".": + # Any character. + if info.flags & DOTALL: + sequence.append(AnyAll()) + elif info.flags & WORD: + sequence.append(AnyU()) + else: + sequence.append(Any()) + elif ch == "[": + # A character set. + sequence.append(parse_set(source, info)) + elif ch == "^": + # The start of a line or the string. + if info.flags & MULTILINE: + if info.flags & WORD: + sequence.append(StartOfLineU()) + else: + sequence.append(StartOfLine()) + else: + sequence.append(StartOfString()) + elif ch == "$": + # The end of a line or the string. + if info.flags & MULTILINE: + if info.flags & WORD: + sequence.append(EndOfLineU()) + else: + sequence.append(EndOfLine()) + else: + if info.flags & WORD: + sequence.append(EndOfStringLineU()) + else: + sequence.append(EndOfStringLine()) + elif ch in "?*+{": + # Looks like a quantifier. + counts = parse_quantifier(source, info, ch) + if counts: + # It _is_ a quantifier. + apply_quantifier(source, info, counts, case_flags, ch, + saved_pos, sequence) + sequence.append(None) + else: + # It's not a quantifier. Maybe it's a fuzzy constraint. + constraints = parse_fuzzy(source, info, ch, case_flags) + if constraints: + # It _is_ a fuzzy constraint. + apply_constraint(source, info, constraints, case_flags, + saved_pos, sequence) + sequence.append(None) + else: + # The element was just a literal. + sequence.append(Character(ord(ch), + case_flags=case_flags)) + else: + # A literal. + sequence.append(Character(ord(ch), case_flags=case_flags)) + else: + # A literal. + sequence.append(Character(ord(ch), case_flags=case_flags)) + + sequence = [item for item in sequence if item is not None] + return Sequence(sequence) + +def apply_quantifier(source, info, counts, case_flags, ch, saved_pos, + sequence): + element = sequence.pop() + if element is None: + if sequence: + raise error("multiple repeat", source.string, saved_pos) + raise error("nothing to repeat", source.string, saved_pos) + + if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)): + raise error("multiple repeat", source.string, saved_pos) + + min_count, max_count = counts + saved_pos = source.pos + ch = source.get() + if ch == "?": + # The "?" suffix that means it's a lazy repeat. + repeated = LazyRepeat + elif ch == "+": + # The "+" suffix that means it's a possessive repeat. + repeated = PossessiveRepeat + else: + # No suffix means that it's a greedy repeat. + source.pos = saved_pos + repeated = GreedyRepeat + + # Ignore the quantifier if it applies to a zero-width item or the number of + # repeats is fixed at 1. 
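+    # For example, "a{1}" compiles exactly like "a", and a quantifier on an
+    # empty group such as "(?:){3}" is simply dropped.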
+ if not element.is_empty() and (min_count != 1 or max_count != 1): + element = repeated(element, min_count, max_count) + + sequence.append(element) + +def apply_constraint(source, info, constraints, case_flags, saved_pos, + sequence): + element = sequence.pop() + if element is None: + raise error("nothing for fuzzy constraint", source.string, saved_pos) + + # If a group is marked as fuzzy then put all of the fuzzy part in the + # group. + if isinstance(element, Group): + element.subpattern = Fuzzy(element.subpattern, constraints) + sequence.append(element) + else: + sequence.append(Fuzzy(element, constraints)) + +_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)} + +def parse_quantifier(source, info, ch): + "Parses a quantifier." + q = _QUANTIFIERS.get(ch) + if q: + # It's a quantifier. + return q + + if ch == "{": + # Looks like a limited repeated element, eg. 'a{2,3}'. + counts = parse_limited_quantifier(source) + if counts: + return counts + + return None + +def is_above_limit(count): + "Checks whether a count is above the maximum." + return count is not None and count >= UNLIMITED + +def parse_limited_quantifier(source): + "Parses a limited quantifier." + saved_pos = source.pos + min_count = parse_count(source) + if source.match(","): + max_count = parse_count(source) + + # No minimum means 0 and no maximum means unlimited. + min_count = int(min_count or 0) + max_count = int(max_count) if max_count else None + else: + if not min_count: + source.pos = saved_pos + return None + + min_count = max_count = int(min_count) + + if not source.match ("}"): + source.pos = saved_pos + return None + + if is_above_limit(min_count) or is_above_limit(max_count): + raise error("repeat count too big", source.string, saved_pos) + + if max_count is not None and min_count > max_count: + raise error("min repeat greater than max repeat", source.string, + saved_pos) + + return min_count, max_count + +def parse_fuzzy(source, info, ch, case_flags): + "Parses a fuzzy setting, if present." + saved_pos = source.pos + + if ch != "{": + return None + + constraints = {} + try: + parse_fuzzy_item(source, constraints) + while source.match(","): + parse_fuzzy_item(source, constraints) + except ParseError: + source.pos = saved_pos + return None + + if source.match(":"): + constraints["test"] = parse_fuzzy_test(source, info, case_flags) + + if not source.match("}"): + raise error("expected }", source.string, source.pos) + + return constraints + +def parse_fuzzy_item(source, constraints): + "Parses a fuzzy setting item." + saved_pos = source.pos + try: + parse_cost_constraint(source, constraints) + except ParseError: + source.pos = saved_pos + + parse_cost_equation(source, constraints) + +def parse_cost_constraint(source, constraints): + "Parses a cost constraint." + saved_pos = source.pos + ch = source.get() + if ch in ALPHA: + # Syntax: constraint [("<=" | "<") cost] + constraint = parse_constraint(source, constraints, ch) + + max_inc = parse_fuzzy_compare(source) + + if max_inc is None: + # No maximum cost. + constraints[constraint] = 0, None + else: + # There's a maximum cost. + cost_pos = source.pos + max_cost = parse_cost_limit(source) + + # Inclusive or exclusive limit? + if not max_inc: + max_cost -= 1 + + if max_cost < 0: + raise error("bad fuzzy cost limit", source.string, cost_pos) + + constraints[constraint] = 0, max_cost + elif ch in DIGITS: + # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost + source.pos = saved_pos + + # Minimum cost. 
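+        # For example, in the fuzzy constraint "{1<=e<=3}" this branch
+        # parses the leading "1" as the minimum cost.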
+        cost_pos = source.pos
+        min_cost = parse_cost_limit(source)
+
+        min_inc = parse_fuzzy_compare(source)
+        if min_inc is None:
+            raise ParseError()
+
+        constraint = parse_constraint(source, constraints, source.get())
+
+        max_inc = parse_fuzzy_compare(source)
+        if max_inc is None:
+            raise ParseError()
+
+        # Maximum cost.
+        cost_pos = source.pos
+        max_cost = parse_cost_limit(source)
+
+        # Inclusive or exclusive limits?
+        if not min_inc:
+            min_cost += 1
+        if not max_inc:
+            max_cost -= 1
+
+        if not 0 <= min_cost <= max_cost:
+            raise error("bad fuzzy cost limit", source.string, cost_pos)
+
+        constraints[constraint] = min_cost, max_cost
+    else:
+        raise ParseError()
+
+def parse_cost_limit(source):
+    "Parses a cost limit."
+    cost_pos = source.pos
+    digits = parse_count(source)
+
+    try:
+        return int(digits)
+    except ValueError:
+        pass
+
+    raise error("bad fuzzy cost limit", source.string, cost_pos)
+
+def parse_constraint(source, constraints, ch):
+    "Parses a constraint."
+    if ch not in "deis":
+        raise ParseError()
+
+    if ch in constraints:
+        raise ParseError()
+
+    return ch
+
+def parse_fuzzy_compare(source):
+    "Parses a cost comparator."
+    if source.match("<="):
+        return True
+    elif source.match("<"):
+        return False
+    else:
+        return None
+
+def parse_cost_equation(source, constraints):
+    "Parses a cost equation."
+    if "cost" in constraints:
+        raise error("more than one cost equation", source.string, source.pos)
+
+    cost = {}
+
+    parse_cost_term(source, cost)
+    while source.match("+"):
+        parse_cost_term(source, cost)
+
+    max_inc = parse_fuzzy_compare(source)
+    if max_inc is None:
+        raise ParseError()
+
+    max_cost = int(parse_count(source))
+
+    if not max_inc:
+        max_cost -= 1
+
+    if max_cost < 0:
+        raise error("bad fuzzy cost limit", source.string, source.pos)
+
+    cost["max"] = max_cost
+
+    constraints["cost"] = cost
+
+def parse_cost_term(source, cost):
+    "Parses a cost equation term."
+    coeff = parse_count(source)
+    ch = source.get()
+    if ch not in "dis":
+        raise ParseError()
+
+    if ch in cost:
+        raise error("repeated fuzzy cost", source.string, source.pos)
+
+    cost[ch] = int(coeff or 1)
+
+def parse_fuzzy_test(source, info, case_flags):
+    saved_pos = source.pos
+    ch = source.get()
+    if ch in SPECIAL_CHARS:
+        if ch == "\\":
+            # An escape sequence outside a set.
+            return parse_escape(source, info, False)
+        elif ch == ".":
+            # Any character.
+            if info.flags & DOTALL:
+                return AnyAll()
+            elif info.flags & WORD:
+                return AnyU()
+            else:
+                return Any()
+        elif ch == "[":
+            # A character set.
+            return parse_set(source, info)
+        else:
+            raise error("expected character set", source.string, saved_pos)
+    elif ch:
+        # A literal.
+        return Character(ord(ch), case_flags=case_flags)
+    else:
+        raise error("expected character set", source.string, saved_pos)
+
+def parse_count(source):
+    "Parses a quantifier's count, which can be empty."
+    return source.get_while(DIGITS)
+
+def parse_paren(source, info):
+    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
+    inline flag.
+    """
+    saved_pos = source.pos
+    ch = source.get(True)
+    if ch == "?":
+        # (?...
+        saved_pos_2 = source.pos
+        ch = source.get(True)
+        if ch == "<":
+            # (?<...
+            saved_pos_3 = source.pos
+            ch = source.get()
+            if ch in ("=", "!"):
+                # (?<=... or (?<!...: lookbehind.
+                return parse_lookaround(source, info, True, ch == "=")
+
+            # (?<...: a named capture group.
+            source.pos = saved_pos_3
+            name = parse_name(source)
+            group = info.open_group(name)
+            source.expect(">")
+            saved_flags = info.flags
+            try:
+                subpattern = _parse_pattern(source, info)
+                source.expect(")")
+            finally:
+                info.flags = saved_flags
+                source.ignore_space = bool(info.flags & VERBOSE)
+
+            info.close_group()
+            return Group(info, group, subpattern)
+        if ch in ("=", "!"):
+            # (?=...
or (?!...: lookahead. + return parse_lookaround(source, info, False, ch == "=") + if ch == "P": + # (?P...: a Python extension. + return parse_extension(source, info) + if ch == "#": + # (?#...: a comment. + return parse_comment(source) + if ch == "(": + # (?(...: a conditional subpattern. + return parse_conditional(source, info) + if ch == ">": + # (?>...: an atomic subpattern. + return parse_atomic(source, info) + if ch == "|": + # (?|...: a common/reset groups branch. + return parse_common(source, info) + if ch == "R" or "0" <= ch <= "9": + # (?R...: probably a call to a group. + return parse_call_group(source, info, ch, saved_pos_2) + if ch == "&": + # (?&...: a call to a named group. + return parse_call_named_group(source, info, saved_pos_2) + + # (?...: probably a flags subpattern. + source.pos = saved_pos_2 + return parse_flags_subpattern(source, info) + + if ch == "*": + # (*... + saved_pos_2 = source.pos + word = source.get_while(set(")>"), include=False) + if word[ : 1].isalpha(): + verb = VERBS.get(word) + if not verb: + raise error("unknown verb", source.string, saved_pos_2) + + source.expect(")") + + return verb + + # (...: an unnamed capture group. + source.pos = saved_pos + group = info.open_group() + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + info.close_group() + + return Group(info, group, subpattern) + +def parse_extension(source, info): + "Parses a Python extension." + saved_pos = source.pos + ch = source.get() + if ch == "<": + # (?P<...: a named capture group. + name = parse_name(source) + group = info.open_group(name) + source.expect(">") + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + info.close_group() + + return Group(info, group, subpattern) + if ch == "=": + # (?P=...: a named group reference. + name = parse_name(source, allow_numeric=True) + source.expect(")") + if info.is_open_group(name): + raise error("cannot refer to an open group", source.string, + saved_pos) + + return make_ref_group(info, name, saved_pos) + if ch == ">" or ch == "&": + # (?P>...: a call to a group. + return parse_call_named_group(source, info, saved_pos) + + source.pos = saved_pos + raise error("unknown extension", source.string, saved_pos) + +def parse_comment(source): + "Parses a comment." + while True: + saved_pos = source.pos + c = source.get(True) + + if not c or c == ")": + break + + if c == "\\": + c = source.get(True) + + source.pos = saved_pos + source.expect(")") + + return None + +def parse_lookaround(source, info, behind, positive): + "Parses a lookaround." + saved_flags = info.flags + try: + subpattern = _parse_pattern(source, info) + source.expect(")") + finally: + info.flags = saved_flags + source.ignore_space = bool(info.flags & VERBOSE) + + return LookAround(behind, positive, subpattern) + +def parse_conditional(source, info): + "Parses a conditional subpattern." + saved_flags = info.flags + saved_pos = source.pos + ch = source.get() + if ch == "?": + # (?(?... + ch = source.get() + if ch in ("=", "!"): + # (?(?=... or (?(?!...: lookahead conditional. + return parse_lookaround_conditional(source, info, False, ch == "=") + if ch == "<": + # (?(?<... + ch = source.get() + if ch in ("=", "!"): + # (?(?<=... 
or (?(?"), include=False) + + if not name: + raise error("missing group name", source.string, source.pos) + + if name.isdigit(): + min_group = 0 if allow_group_0 else 1 + if not allow_numeric or int(name) < min_group: + raise error("bad character in group name", source.string, + source.pos) + else: + if not name.isidentifier(): + raise error("bad character in group name", source.string, + source.pos) + + return name + +def is_octal(string): + "Checks whether a string is octal." + return all(ch in OCT_DIGITS for ch in string) + +def is_decimal(string): + "Checks whether a string is decimal." + return all(ch in DIGITS for ch in string) + +def is_hexadecimal(string): + "Checks whether a string is hexadecimal." + return all(ch in HEX_DIGITS for ch in string) + +def parse_escape(source, info, in_set): + "Parses an escape sequence." + saved_ignore = source.ignore_space + source.ignore_space = False + ch = source.get() + source.ignore_space = saved_ignore + if not ch: + # A backslash at the end of the pattern. + raise error("bad escape (end of pattern)", source.string, source.pos) + if ch in HEX_ESCAPES: + # A hexadecimal escape sequence. + return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch) + elif ch == "g" and not in_set: + # A group reference. + saved_pos = source.pos + try: + return parse_group_ref(source, info) + except error: + # Invalid as a group reference, so assume it's a literal. + source.pos = saved_pos + + return make_character(info, ord(ch), in_set) + elif ch == "G" and not in_set: + # A search anchor. + return SearchAnchor() + elif ch == "L" and not in_set: + # A string set. + return parse_string_set(source, info) + elif ch == "N": + # A named codepoint. + return parse_named_char(source, info, in_set) + elif ch in "pP": + # A Unicode property, positive or negative. + return parse_property(source, info, ch == "p", in_set) + elif ch == "R" and not in_set: + # A line ending. + charset = [0x0A, 0x0B, 0x0C, 0x0D] + if info.guess_encoding == UNICODE: + charset.extend([0x85, 0x2028, 0x2029]) + + return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c) + for c in charset])])) + elif ch == "X" and not in_set: + # A grapheme cluster. + return Grapheme() + elif ch in ALPHA: + # An alphabetic escape sequence. + # Positional escapes aren't allowed inside a character set. + if not in_set: + if info.flags & WORD: + value = WORD_POSITION_ESCAPES.get(ch) + else: + value = POSITION_ESCAPES.get(ch) + + if value: + return value + + value = CHARSET_ESCAPES.get(ch) + if value: + return value + + value = CHARACTER_ESCAPES.get(ch) + if value: + return Character(ord(value)) + + raise error("bad escape \\%s" % ch, source.string, source.pos) + elif ch in DIGITS: + # A numeric escape sequence. + return parse_numeric_escape(source, info, ch, in_set) + else: + # A literal. + return make_character(info, ord(ch), in_set) + +def parse_numeric_escape(source, info, ch, in_set): + "Parses a numeric escape sequence." + if in_set or ch == "0": + # Octal escape sequence, max 3 digits. + return parse_octal_escape(source, info, [ch], in_set) + + # At least 1 digit, so either octal escape or group. + digits = ch + saved_pos = source.pos + ch = source.get() + if ch in DIGITS: + # At least 2 digits, so either octal escape or group. + digits += ch + saved_pos = source.pos + ch = source.get() + if is_octal(digits) and ch in OCT_DIGITS: + # 3 octal digits, so octal escape sequence. 
+ encoding = info.flags & _ALL_ENCODINGS + if encoding == ASCII or encoding == LOCALE: + octal_mask = 0xFF + else: + octal_mask = 0x1FF + + value = int(digits + ch, 8) & octal_mask + return make_character(info, value) + + # Group reference. + source.pos = saved_pos + if info.is_open_group(digits): + raise error("cannot refer to an open group", source.string, source.pos) + + return make_ref_group(info, digits, source.pos) + +def parse_octal_escape(source, info, digits, in_set): + "Parses an octal escape sequence." + saved_pos = source.pos + ch = source.get() + while len(digits) < 3 and ch in OCT_DIGITS: + digits.append(ch) + saved_pos = source.pos + ch = source.get() + + source.pos = saved_pos + try: + value = int("".join(digits), 8) + return make_character(info, value, in_set) + except ValueError: + if digits[0] in OCT_DIGITS: + raise error("incomplete escape \\%s" % ''.join(digits), + source.string, source.pos) + else: + raise error("bad escape \\%s" % digits[0], source.string, + source.pos) + +def parse_hex_escape(source, info, esc, expected_len, in_set, type): + "Parses a hex escape sequence." + saved_pos = source.pos + digits = [] + for i in range(expected_len): + ch = source.get() + if ch not in HEX_DIGITS: + raise error("incomplete escape \\%s%s" % (type, ''.join(digits)), + source.string, saved_pos) + digits.append(ch) + + try: + value = int("".join(digits), 16) + except ValueError: + pass + else: + if value < 0x110000: + return make_character(info, value, in_set) + + # Bad hex escape. + raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)), + source.string, saved_pos) + +def parse_group_ref(source, info): + "Parses a group reference." + source.expect("<") + saved_pos = source.pos + name = parse_name(source, True) + source.expect(">") + if info.is_open_group(name): + raise error("cannot refer to an open group", source.string, source.pos) + + return make_ref_group(info, name, saved_pos) + +def parse_string_set(source, info): + "Parses a string set reference." + source.expect("<") + name = parse_name(source, True) + source.expect(">") + if name is None or name not in info.kwargs: + raise error("undefined named list", source.string, source.pos) + + return make_string_set(info, name) + +def parse_named_char(source, info, in_set): + "Parses a named character." + saved_pos = source.pos + if source.match("{"): + name = source.get_while(NAMED_CHAR_PART, keep_spaces=True) + if source.match("}"): + try: + value = unicodedata.lookup(name) + return make_character(info, ord(value), in_set) + except KeyError: + raise error("undefined character name", source.string, + source.pos) + + source.pos = saved_pos + return make_character(info, ord("N"), in_set) + +def parse_property(source, info, positive, in_set): + "Parses a Unicode property." + saved_pos = source.pos + ch = source.get() + if ch == "{": + negate = source.match("^") + prop_name, name = parse_property_name(source) + if source.match("}"): + # It's correctly delimited. + prop = lookup_property(prop_name, name, positive != negate, source) + return make_property(info, prop, in_set) + elif ch and ch in "CLMNPSZ": + # An abbreviated property, eg \pL. + prop = lookup_property(None, ch, positive, source) + return make_property(info, prop, in_set) + + # Not a property, so treat as a literal "p" or "P". + source.pos = saved_pos + ch = "p" if positive else "P" + return make_character(info, ord(ch), in_set) + +def parse_property_name(source): + "Parses a property name, which may be qualified." 
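+    # For example, "Script=Latin" yields ("Script", "Latin"), while a bare
+    # "Latin" yields (None, "Latin").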
+ name = source.get_while(PROPERTY_NAME_PART) + saved_pos = source.pos + + ch = source.get() + if ch and ch in ":=": + prop_name = name + name = source.get_while(ALNUM | set(" &_-./")).strip() + + if name: + # Name after the ":" or "=", so it's a qualified name. + saved_pos = source.pos + else: + # No name after the ":" or "=", so assume it's an unqualified name. + prop_name, name = None, prop_name + else: + prop_name = None + + source.pos = saved_pos + return prop_name, name + +def parse_set(source, info): + "Parses a character set." + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + + saved_ignore = source.ignore_space + source.ignore_space = False + # Negative set? + negate = source.match("^") + try: + if version == VERSION0: + item = parse_set_imp_union(source, info) + else: + item = parse_set_union(source, info) + + if not source.match("]"): + raise error("missing ]", source.string, source.pos) + finally: + source.ignore_space = saved_ignore + + if negate: + item = item.with_flags(positive=not item.positive) + + item = item.with_flags(case_flags=make_case_flags(info)) + + return item + +def parse_set_union(source, info): + "Parses a set union ([x||y])." + items = [parse_set_symm_diff(source, info)] + while source.match("||"): + items.append(parse_set_symm_diff(source, info)) + + if len(items) == 1: + return items[0] + return SetUnion(info, items) + +def parse_set_symm_diff(source, info): + "Parses a set symmetric difference ([x~~y])." + items = [parse_set_inter(source, info)] + while source.match("~~"): + items.append(parse_set_inter(source, info)) + + if len(items) == 1: + return items[0] + return SetSymDiff(info, items) + +def parse_set_inter(source, info): + "Parses a set intersection ([x&&y])." + items = [parse_set_diff(source, info)] + while source.match("&&"): + items.append(parse_set_diff(source, info)) + + if len(items) == 1: + return items[0] + return SetInter(info, items) + +def parse_set_diff(source, info): + "Parses a set difference ([x--y])." + items = [parse_set_imp_union(source, info)] + while source.match("--"): + items.append(parse_set_imp_union(source, info)) + + if len(items) == 1: + return items[0] + return SetDiff(info, items) + +def parse_set_imp_union(source, info): + "Parses a set implicit union ([xy])." + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + + items = [parse_set_member(source, info)] + while True: + saved_pos = source.pos + if source.match("]"): + # End of the set. + source.pos = saved_pos + break + + if version == VERSION1 and any(source.match(op) for op in SET_OPS): + # The new behaviour has set operators. + source.pos = saved_pos + break + + items.append(parse_set_member(source, info)) + + if len(items) == 1: + return items[0] + return SetUnion(info, items) + +def parse_set_member(source, info): + "Parses a member in a character set." + # Parse a set item. + start = parse_set_item(source, info) + saved_pos1 = source.pos + if (not isinstance(start, Character) or not start.positive or not + source.match("-")): + # It's not the start of a range. + return start + + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + + # It looks like the start of a range of characters. + saved_pos2 = source.pos + if version == VERSION1 and source.match("-"): + # It's actually the set difference operator '--', so return the + # character. + source.pos = saved_pos1 + return start + + if source.match("]"): + # We've reached the end of the set, so return both the character and + # hyphen. 
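+        # For example, in "[a-]" the trailing hyphen is a literal member of
+        # the set.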
+ source.pos = saved_pos2 + return SetUnion(info, [start, Character(ord("-"))]) + + # Parse a set item. + end = parse_set_item(source, info) + if not isinstance(end, Character) or not end.positive: + # It's not a range, so return the character, hyphen and property. + return SetUnion(info, [start, Character(ord("-")), end]) + + # It _is_ a range. + if start.value > end.value: + raise error("bad character range", source.string, source.pos) + + if start.value == end.value: + return start + + return Range(start.value, end.value) + +def parse_set_item(source, info): + "Parses an item in a character set." + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + + if source.match("\\"): + # An escape sequence in a set. + return parse_escape(source, info, True) + + saved_pos = source.pos + if source.match("[:"): + # Looks like a POSIX character class. + try: + return parse_posix_class(source, info) + except ParseError: + # Not a POSIX character class. + source.pos = saved_pos + + if version == VERSION1 and source.match("["): + # It's the start of a nested set. + + # Negative set? + negate = source.match("^") + item = parse_set_union(source, info) + + if not source.match("]"): + raise error("missing ]", source.string, source.pos) + + if negate: + item = item.with_flags(positive=not item.positive) + + return item + + ch = source.get() + if not ch: + raise error("unterminated character set", source.string, source.pos) + + return Character(ord(ch)) + +def parse_posix_class(source, info): + "Parses a POSIX character class." + negate = source.match("^") + prop_name, name = parse_property_name(source) + if not source.match(":]"): + raise ParseError() + + return lookup_property(prop_name, name, not negate, source, posix=True) + +def float_to_rational(flt): + "Converts a float to a rational pair." + int_part = int(flt) + error = flt - int_part + if abs(error) < 0.0001: + return int_part, 1 + + den, num = float_to_rational(1.0 / error) + + return int_part * den + num, den + +def numeric_to_rational(numeric): + "Converts a numeric string to a rational string, if possible." + if numeric[ : 1] == "-": + sign, numeric = numeric[0], numeric[1 : ] + else: + sign = "" + + parts = numeric.split("/") + if len(parts) == 2: + num, den = float_to_rational(float(parts[0]) / float(parts[1])) + elif len(parts) == 1: + num, den = float_to_rational(float(parts[0])) + else: + raise ValueError() + + result = "{}{}/{}".format(sign, num, den) + if result.endswith("/1"): + return result[ : -2] + + return result + +def standardise_name(name): + "Standardises a property or value name." + try: + return numeric_to_rational("".join(name)) + except (ValueError, ZeroDivisionError): + return "".join(ch for ch in name if ch not in "_- ").upper() + +_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split()) + +_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split()) + +def lookup_property(property, value, positive, source=None, posix=False): + "Looks up a property." + # Normalise the names (which may still be lists). + property = standardise_name(property) if property else None + value = standardise_name(value) + + if (property, value) == ("GENERALCATEGORY", "ASSIGNED"): + property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive + + if posix and not property and value.upper() in _POSIX_CLASSES: + value = 'POSIX' + value + + if property: + # Both the property and the value are provided. 
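+        # For example, \p{Script=Greek} reaches this point with the property
+        # name "SCRIPT" and the value "GREEK" after standardisation.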
+ prop = PROPERTIES.get(property) + if not prop: + if not source: + raise error("unknown property") + + raise error("unknown property", source.string, source.pos) + + prop_id, value_dict = prop + val_id = value_dict.get(value) + if val_id is None: + if not source: + raise error("unknown property value") + + raise error("unknown property value", source.string, source.pos) + + return Property((prop_id << 16) | val_id, positive) + + # Only the value is provided. + # It might be the name of a GC, script or block value. + for property in ("GC", "SCRIPT", "BLOCK"): + prop_id, value_dict = PROPERTIES.get(property) + val_id = value_dict.get(value) + if val_id is not None: + return Property((prop_id << 16) | val_id, positive) + + # It might be the name of a binary property. + prop = PROPERTIES.get(value) + if prop: + prop_id, value_dict = prop + if set(value_dict) == _BINARY_VALUES: + return Property((prop_id << 16) | 1, positive) + + return Property(prop_id << 16, not positive) + + # It might be the name of a binary property starting with a prefix. + if value.startswith("IS"): + prop = PROPERTIES.get(value[2 : ]) + if prop: + prop_id, value_dict = prop + if "YES" in value_dict: + return Property((prop_id << 16) | 1, positive) + + # It might be the name of a script or block starting with a prefix. + for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")): + if value.startswith(prefix): + prop_id, value_dict = PROPERTIES.get(property) + val_id = value_dict.get(value[2 : ]) + if val_id is not None: + return Property((prop_id << 16) | val_id, positive) + + # Unknown property. + if not source: + raise error("unknown property") + + raise error("unknown property", source.string, source.pos) + +def _compile_replacement(source, pattern, is_unicode): + "Compiles a replacement template escape sequence." + ch = source.get() + if ch in ALPHA: + # An alphabetic escape sequence. + value = CHARACTER_ESCAPES.get(ch) + if value: + return False, [ord(value)] + + if ch in HEX_ESCAPES and (ch == "x" or is_unicode): + # A hexadecimal escape sequence. + return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)] + + if ch == "g": + # A group preference. + return True, [compile_repl_group(source, pattern)] + + if ch == "N" and is_unicode: + # A named character. + value = parse_repl_named_char(source) + if value is not None: + return False, [value] + + raise error("bad escape \\%s" % ch, source.string, source.pos) + + if isinstance(source.sep, bytes): + octal_mask = 0xFF + else: + octal_mask = 0x1FF + + if ch == "0": + # An octal escape sequence. + digits = ch + while len(digits) < 3: + saved_pos = source.pos + ch = source.get() + if ch not in OCT_DIGITS: + source.pos = saved_pos + break + digits += ch + + return False, [int(digits, 8) & octal_mask] + + if ch in DIGITS: + # Either an octal escape sequence (3 digits) or a group reference (max + # 2 digits). + digits = ch + saved_pos = source.pos + ch = source.get() + if ch in DIGITS: + digits += ch + saved_pos = source.pos + ch = source.get() + if ch and is_octal(digits + ch): + # An octal escape sequence. + return False, [int(digits + ch, 8) & octal_mask] + + # A group reference. + source.pos = saved_pos + return True, [int(digits)] + + if ch == "\\": + # An escaped backslash is a backslash. + return False, [ord("\\")] + + if not ch: + # A trailing backslash. + raise error("bad escape (end of pattern)", source.string, source.pos) + + # An escaped non-backslash is a backslash followed by the literal. 
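+    # For example, the replacement escape r"\q" is left as the two
+    # characters backslash and "q".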
+ return False, [ord("\\"), ord(ch)] + +def parse_repl_hex_escape(source, expected_len, type): + "Parses a hex escape sequence in a replacement string." + digits = [] + for i in range(expected_len): + ch = source.get() + if ch not in HEX_DIGITS: + raise error("incomplete escape \\%s%s" % (type, ''.join(digits)), + source.string, source.pos) + digits.append(ch) + + return int("".join(digits), 16) + +def parse_repl_named_char(source): + "Parses a named character in a replacement string." + saved_pos = source.pos + if source.match("{"): + name = source.get_while(ALPHA | set(" ")) + + if source.match("}"): + try: + value = unicodedata.lookup(name) + return ord(value) + except KeyError: + raise error("undefined character name", source.string, + source.pos) + + source.pos = saved_pos + return None + +def compile_repl_group(source, pattern): + "Compiles a replacement template group reference." + source.expect("<") + name = parse_name(source, True, True) + + source.expect(">") + if name.isdigit(): + index = int(name) + if not 0 <= index <= pattern.groups: + raise error("invalid group reference", source.string, source.pos) + + return index + + try: + return pattern.groupindex[name] + except KeyError: + raise IndexError("unknown group") + +# The regular expression is parsed into a syntax tree. The different types of +# node are defined below. + +INDENT = " " +POSITIVE_OP = 0x1 +ZEROWIDTH_OP = 0x2 +FUZZY_OP = 0x4 +REVERSE_OP = 0x8 +REQUIRED_OP = 0x10 + +POS_TEXT = {False: "NON-MATCH", True: "MATCH"} +CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "", + FULLIGNORECASE: " FULL_IGNORE_CASE"} + +def make_sequence(items): + if len(items) == 1: + return items[0] + return Sequence(items) + +# Common base class for all nodes. +class RegexBase: + def __init__(self): + self._key = self.__class__ + + def with_flags(self, positive=None, case_flags=None, zerowidth=None): + if positive is None: + positive = self.positive + else: + positive = bool(positive) + if case_flags is None: + case_flags = self.case_flags + else: + case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS] + if zerowidth is None: + zerowidth = self.zerowidth + else: + zerowidth = bool(zerowidth) + + if (positive == self.positive and case_flags == self.case_flags and + zerowidth == self.zerowidth): + return self + + return self.rebuild(positive, case_flags, zerowidth) + + def fix_groups(self, pattern, reverse, fuzzy): + pass + + def optimise(self, info, reverse): + return self + + def pack_characters(self, info): + return self + + def remove_captures(self): + return self + + def is_atomic(self): + return True + + def can_be_affix(self): + return True + + def contains_group(self): + return False + + def get_firstset(self, reverse): + raise _FirstSetError() + + def has_simple_start(self): + return False + + def compile(self, reverse=False, fuzzy=False): + return self._compile(reverse, fuzzy) + + def is_empty(self): + return False + + def __hash__(self): + return hash(self._key) + + def __eq__(self, other): + return type(self) is type(other) and self._key == other._key + + def __ne__(self, other): + return not self.__eq__(other) + + def get_required_string(self, reverse): + return self.max_width(), None + +# Base class for zero-width nodes. 
+class ZeroWidthBase(RegexBase): + def __init__(self, positive=True): + RegexBase.__init__(self) + self.positive = bool(positive) + + self._key = self.__class__, self.positive + + def get_firstset(self, reverse): + return set([None]) + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if fuzzy: + flags |= FUZZY_OP + if reverse: + flags |= REVERSE_OP + return [(self._opcode, flags)] + + def dump(self, indent, reverse): + print("{}{} {}".format(INDENT * indent, self._op_name, + POS_TEXT[self.positive])) + + def max_width(self): + return 0 + +class Any(RegexBase): + _opcode = {False: OP.ANY, True: OP.ANY_REV} + _op_name = "ANY" + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if fuzzy: + flags |= FUZZY_OP + return [(self._opcode[reverse], flags)] + + def dump(self, indent, reverse): + print("{}{}".format(INDENT * indent, self._op_name)) + + def max_width(self): + return 1 + +class AnyAll(Any): + _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV} + _op_name = "ANY_ALL" + +class AnyU(Any): + _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV} + _op_name = "ANY_U" + +class Atomic(RegexBase): + def __init__(self, subpattern): + RegexBase.__init__(self) + self.subpattern = subpattern + + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info, reverse): + self.subpattern = self.subpattern.optimise(info, reverse) + + if self.subpattern.is_empty(): + return self.subpattern + return self + + def pack_characters(self, info): + self.subpattern = self.subpattern.pack_characters(info) + return self + + def remove_captures(self): + self.subpattern = self.subpattern.remove_captures() + return self + + def can_be_affix(self): + return self.subpattern.can_be_affix() + + def contains_group(self): + return self.subpattern.contains_group() + + def get_firstset(self, reverse): + return self.subpattern.get_firstset(reverse) + + def has_simple_start(self): + return self.subpattern.has_simple_start() + + def _compile(self, reverse, fuzzy): + return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) + + [(OP.END, )]) + + def dump(self, indent, reverse): + print("{}ATOMIC".format(INDENT * indent)) + self.subpattern.dump(indent + 1, reverse) + + def is_empty(self): + return self.subpattern.is_empty() + + def __eq__(self, other): + return (type(self) is type(other) and self.subpattern == + other.subpattern) + + def max_width(self): + return self.subpattern.max_width() + + def get_required_string(self, reverse): + return self.subpattern.get_required_string(reverse) + +class Boundary(ZeroWidthBase): + _opcode = OP.BOUNDARY + _op_name = "BOUNDARY" + +class Branch(RegexBase): + def __init__(self, branches): + RegexBase.__init__(self) + self.branches = branches + + def fix_groups(self, pattern, reverse, fuzzy): + for b in self.branches: + b.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info, reverse): + if not self.branches: + return Sequence([]) + + # Flatten branches within branches. + branches = Branch._flatten_branches(info, reverse, self.branches) + + # Move any common prefix or suffix out of the branches. + if reverse: + suffix, branches = Branch._split_common_suffix(info, branches) + prefix = [] + else: + prefix, branches = Branch._split_common_prefix(info, branches) + suffix = [] + + # Try to reduce adjacent single-character branches to sets. 
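+        # For example, the branches of "a|b|c" can be collapsed into the
+        # single set [abc].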
+ branches = Branch._reduce_to_set(info, reverse, branches) + + if len(branches) > 1: + sequence = [Branch(branches)] + + if not prefix or not suffix: + # We might be able to add a quick precheck before the branches. + firstset = self._add_precheck(info, reverse, branches) + + if firstset: + if reverse: + sequence.append(firstset) + else: + sequence.insert(0, firstset) + else: + sequence = branches + + return make_sequence(prefix + sequence + suffix) + + def _add_precheck(self, info, reverse, branches): + charset = set() + pos = -1 if reverse else 0 + + for branch in branches: + if type(branch) is Literal and branch.case_flags == NOCASE: + charset.add(branch.characters[pos]) + else: + return + + if not charset: + return None + + return _check_firstset(info, reverse, [Character(c) for c in charset]) + + def pack_characters(self, info): + self.branches = [b.pack_characters(info) for b in self.branches] + return self + + def remove_captures(self): + self.branches = [b.remove_captures() for b in self.branches] + return self + + def is_atomic(self): + return all(b.is_atomic() for b in self.branches) + + def can_be_affix(self): + return all(b.can_be_affix() for b in self.branches) + + def contains_group(self): + return any(b.contains_group() for b in self.branches) + + def get_firstset(self, reverse): + fs = set() + for b in self.branches: + fs |= b.get_firstset(reverse) + + return fs or set([None]) + + def _compile(self, reverse, fuzzy): + if not self.branches: + return [] + + code = [(OP.BRANCH, )] + for b in self.branches: + code.extend(b.compile(reverse, fuzzy)) + code.append((OP.NEXT, )) + + code[-1] = (OP.END, ) + + return code + + def dump(self, indent, reverse): + print("{}BRANCH".format(INDENT * indent)) + self.branches[0].dump(indent + 1, reverse) + for b in self.branches[1 : ]: + print("{}OR".format(INDENT * indent)) + b.dump(indent + 1, reverse) + + @staticmethod + def _flatten_branches(info, reverse, branches): + # Flatten the branches so that there aren't branches of branches. + new_branches = [] + for b in branches: + b = b.optimise(info, reverse) + if isinstance(b, Branch): + new_branches.extend(b.branches) + else: + new_branches.append(b) + + return new_branches + + @staticmethod + def _split_common_prefix(info, branches): + # Common leading items can be moved out of the branches. + # Get the items in the branches. + alternatives = [] + for b in branches: + if isinstance(b, Sequence): + alternatives.append(b.items) + else: + alternatives.append([b]) + + # What is the maximum possible length of the prefix? + max_count = min(len(a) for a in alternatives) + + # What is the longest common prefix? + prefix = alternatives[0] + pos = 0 + end_pos = max_count + while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] == + prefix[pos] for a in alternatives): + pos += 1 + count = pos + + if info.flags & UNICODE: + # We need to check that we're not splitting a sequence of + # characters which could form part of full case-folding. + count = pos + while count > 0 and not all(Branch._can_split(a, count) for a in + alternatives): + count -= 1 + + # No common prefix is possible. + if count == 0: + return [], branches + + # Rebuild the branches. + new_branches = [] + for a in alternatives: + new_branches.append(make_sequence(a[count : ])) + + return prefix[ : count], new_branches + + @staticmethod + def _split_common_suffix(info, branches): + # Common trailing items can be moved out of the branches. + # Get the items in the branches. 
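+        # For example, "abcz|defz" can be rewritten as "(?:abc|def)z".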
+ alternatives = [] + for b in branches: + if isinstance(b, Sequence): + alternatives.append(b.items) + else: + alternatives.append([b]) + + # What is the maximum possible length of the suffix? + max_count = min(len(a) for a in alternatives) + + # What is the longest common suffix? + suffix = alternatives[0] + pos = -1 + end_pos = -1 - max_count + while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] == + suffix[pos] for a in alternatives): + pos -= 1 + count = -1 - pos + + if info.flags & UNICODE: + # We need to check that we're not splitting a sequence of + # characters which could form part of full case-folding. + while count > 0 and not all(Branch._can_split_rev(a, count) for a + in alternatives): + count -= 1 + + # No common suffix is possible. + if count == 0: + return [], branches + + # Rebuild the branches. + new_branches = [] + for a in alternatives: + new_branches.append(make_sequence(a[ : -count])) + + return suffix[-count : ], new_branches + + @staticmethod + def _can_split(items, count): + # Check the characters either side of the proposed split. + if not Branch._is_full_case(items, count - 1): + return True + + if not Branch._is_full_case(items, count): + return True + + # Check whether a 1-1 split would be OK. + if Branch._is_folded(items[count - 1 : count + 1]): + return False + + # Check whether a 1-2 split would be OK. + if (Branch._is_full_case(items, count + 2) and + Branch._is_folded(items[count - 1 : count + 2])): + return False + + # Check whether a 2-1 split would be OK. + if (Branch._is_full_case(items, count - 2) and + Branch._is_folded(items[count - 2 : count + 1])): + return False + + return True + + @staticmethod + def _can_split_rev(items, count): + end = len(items) + + # Check the characters either side of the proposed split. + if not Branch._is_full_case(items, end - count): + return True + + if not Branch._is_full_case(items, end - count - 1): + return True + + # Check whether a 1-1 split would be OK. + if Branch._is_folded(items[end - count - 1 : end - count + 1]): + return False + + # Check whether a 1-2 split would be OK. + if (Branch._is_full_case(items, end - count + 2) and + Branch._is_folded(items[end - count - 1 : end - count + 2])): + return False + + # Check whether a 2-1 split would be OK. + if (Branch._is_full_case(items, end - count - 2) and + Branch._is_folded(items[end - count - 2 : end - count + 1])): + return False + + return True + + @staticmethod + def _merge_common_prefixes(info, reverse, branches): + # Branches with the same case-sensitive character prefix can be grouped + # together if they are separated only by other branches with a + # character prefix. + prefixed = defaultdict(list) + order = {} + new_branches = [] + for b in branches: + if Branch._is_simple_character(b): + # Branch starts with a simple character. + prefixed[b.value].append([b]) + order.setdefault(b.value, len(order)) + elif (isinstance(b, Sequence) and b.items and + Branch._is_simple_character(b.items[0])): + # Branch starts with a simple character. 
+                prefixed[b.items[0].value].append(b.items)
+                order.setdefault(b.items[0].value, len(order))
+            else:
+                Branch._flush_char_prefix(info, reverse, prefixed, order,
+                  new_branches)
+
+                new_branches.append(b)
+
+        Branch._flush_char_prefix(info, reverse, prefixed, order,
+          new_branches)
+
+        return new_branches
+
+    @staticmethod
+    def _is_simple_character(c):
+        return isinstance(c, Character) and c.positive and not c.case_flags
+
+    @staticmethod
+    def _reduce_to_set(info, reverse, branches):
+        # Can the branches be reduced to a set?
+        new_branches = []
+        items = set()
+        case_flags = NOCASE
+        for b in branches:
+            if isinstance(b, (Character, Property, SetBase)):
+                # Branch starts with a single character.
+                if b.case_flags != case_flags:
+                    # Different case sensitivity, so flush.
+                    Branch._flush_set_members(info, reverse, items, case_flags,
+                      new_branches)
+
+                    case_flags = b.case_flags
+
+                items.add(b.with_flags(case_flags=NOCASE))
+            else:
+                Branch._flush_set_members(info, reverse, items, case_flags,
+                  new_branches)
+
+                new_branches.append(b)
+
+        Branch._flush_set_members(info, reverse, items, case_flags,
+          new_branches)
+
+        return new_branches
+
+    @staticmethod
+    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
+        # Flush the prefixed branches.
+        if not prefixed:
+            return
+
+        for value, branches in sorted(prefixed.items(), key=lambda pair:
+          order[pair[0]]):
+            if len(branches) == 1:
+                new_branches.append(make_sequence(branches[0]))
+            else:
+                subbranches = []
+                optional = False
+                for b in branches:
+                    if len(b) > 1:
+                        subbranches.append(make_sequence(b[1 : ]))
+                    elif not optional:
+                        subbranches.append(Sequence())
+                        optional = True
+
+                sequence = Sequence([Character(value), Branch(subbranches)])
+                new_branches.append(sequence.optimise(info, reverse))
+
+        prefixed.clear()
+        order.clear()
+
+    @staticmethod
+    def _flush_set_members(info, reverse, items, case_flags, new_branches):
+        # Flush the set members.
+        if not items:
+            return
+
+        if len(items) == 1:
+            item = list(items)[0]
+        else:
+            item = SetUnion(info, list(items)).optimise(info, reverse)
+
+        new_branches.append(item.with_flags(case_flags=case_flags))
+
+        items.clear()
+
+    @staticmethod
+    def _is_full_case(items, i):
+        if not 0 <= i < len(items):
+            return False
+
+        item = items[i]
+        return (isinstance(item, Character) and item.positive and
+          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
+
+    @staticmethod
+    def _is_folded(items):
+        if len(items) < 2:
+            return False
+
+        for i in items:
+            if (not isinstance(i, Character) or not i.positive or not
+              i.case_flags):
+                return False
+
+        folded = "".join(chr(i.value) for i in items)
+        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
+
+        # Get the characters which expand to multiple codepoints on folding.
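+        # If the folded run equals the folding of one of those characters,
+        # the proposed split would cut through a full case-folding expansion.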
+ expanding_chars = _regex.get_expand_on_folding() + + for c in expanding_chars: + if folded == _regex.fold_case(FULL_CASE_FOLDING, c): + return True + + return False + + def is_empty(self): + return all(b.is_empty() for b in self.branches) + + def __eq__(self, other): + return type(self) is type(other) and self.branches == other.branches + + def max_width(self): + return max(b.max_width() for b in self.branches) + +class CallGroup(RegexBase): + def __init__(self, info, group, position): + RegexBase.__init__(self) + self.info = info + self.group = group + self.position = position + + self._key = self.__class__, self.group + + def fix_groups(self, pattern, reverse, fuzzy): + try: + self.group = int(self.group) + except ValueError: + try: + self.group = self.info.group_index[self.group] + except KeyError: + raise error("invalid group reference", pattern, self.position) + + if not 0 <= self.group <= self.info.group_count: + raise error("unknown group", pattern, self.position) + + if self.group > 0 and self.info.open_group_count[self.group] > 1: + raise error("ambiguous group reference", pattern, self.position) + + self.info.group_calls.append((self, reverse, fuzzy)) + + self._key = self.__class__, self.group + + def remove_captures(self): + raise error("group reference not allowed", pattern, self.position) + + def _compile(self, reverse, fuzzy): + return [(OP.GROUP_CALL, self.call_ref)] + + def dump(self, indent, reverse): + print("{}GROUP_CALL {}".format(INDENT * indent, self.group)) + + def __eq__(self, other): + return type(self) is type(other) and self.group == other.group + + def max_width(self): + return UNLIMITED + + def __del__(self): + self.info = None + +class CallRef(RegexBase): + def __init__(self, ref, parsed): + self.ref = ref + self.parsed = parsed + + def _compile(self, reverse, fuzzy): + return ([(OP.CALL_REF, self.ref)] + self.parsed._compile(reverse, + fuzzy) + [(OP.END, )]) + +class Character(RegexBase): + _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False): + OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE, + False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE, + True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV, + (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV} + + def __init__(self, value, positive=True, case_flags=NOCASE, + zerowidth=False): + RegexBase.__init__(self) + self.value = value + self.positive = bool(positive) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + self.zerowidth = bool(zerowidth) + + if (self.positive and (self.case_flags & FULLIGNORECASE) == + FULLIGNORECASE): + self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value)) + else: + self.folded = chr(self.value) + + self._key = (self.__class__, self.value, self.positive, + self.case_flags, self.zerowidth) + + def rebuild(self, positive, case_flags, zerowidth): + return Character(self.value, positive, case_flags, zerowidth) + + def optimise(self, info, reverse, in_set=False): + return self + + def get_firstset(self, reverse): + return set([self]) + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if self.zerowidth: + flags |= ZEROWIDTH_OP + if fuzzy: + flags |= FUZZY_OP + + code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags, + self.value]) + + if len(self.folded) > 1: + # The character expands on full case-folding. 
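+            # e.g. U+00DF ("ß") folds to "ss", so match either the single
+            # codepoint or the folded string.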
+ code = Branch([code, String([ord(c) for c in self.folded], + case_flags=self.case_flags)]) + + return code.compile(reverse, fuzzy) + + def dump(self, indent, reverse): + display = ascii(chr(self.value)).lstrip("bu") + print("{}CHARACTER {} {}{}".format(INDENT * indent, + POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags])) + + def matches(self, ch): + return (ch == self.value) == self.positive + + def max_width(self): + return len(self.folded) + + def get_required_string(self, reverse): + if not self.positive: + return 1, None + + self.folded_characters = tuple(ord(c) for c in self.folded) + + return 0, self + +class Conditional(RegexBase): + def __init__(self, info, group, yes_item, no_item, position): + RegexBase.__init__(self) + self.info = info + self.group = group + self.yes_item = yes_item + self.no_item = no_item + self.position = position + + def fix_groups(self, pattern, reverse, fuzzy): + try: + self.group = int(self.group) + except ValueError: + try: + self.group = self.info.group_index[self.group] + except KeyError: + if self.group == 'DEFINE': + # 'DEFINE' is a special name unless there's a group with + # that name. + self.group = 0 + else: + raise error("unknown group", pattern, self.position) + + if not 0 <= self.group <= self.info.group_count: + raise error("invalid group reference", pattern, self.position) + + self.yes_item.fix_groups(pattern, reverse, fuzzy) + self.no_item.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info, reverse): + yes_item = self.yes_item.optimise(info, reverse) + no_item = self.no_item.optimise(info, reverse) + + return Conditional(info, self.group, yes_item, no_item, self.position) + + def pack_characters(self, info): + self.yes_item = self.yes_item.pack_characters(info) + self.no_item = self.no_item.pack_characters(info) + return self + + def remove_captures(self): + self.yes_item = self.yes_item.remove_captures() + self.no_item = self.no_item.remove_captures() + + def is_atomic(self): + return self.yes_item.is_atomic() and self.no_item.is_atomic() + + def can_be_affix(self): + return self.yes_item.can_be_affix() and self.no_item.can_be_affix() + + def contains_group(self): + return self.yes_item.contains_group() or self.no_item.contains_group() + + def get_firstset(self, reverse): + return (self.yes_item.get_firstset(reverse) | + self.no_item.get_firstset(reverse)) + + def _compile(self, reverse, fuzzy): + code = [(OP.GROUP_EXISTS, self.group)] + code.extend(self.yes_item.compile(reverse, fuzzy)) + add_code = self.no_item.compile(reverse, fuzzy) + if add_code: + code.append((OP.NEXT, )) + code.extend(add_code) + + code.append((OP.END, )) + + return code + + def dump(self, indent, reverse): + print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group)) + self.yes_item.dump(indent + 1, reverse) + if not self.no_item.is_empty(): + print("{}OR".format(INDENT * indent)) + self.no_item.dump(indent + 1, reverse) + + def is_empty(self): + return self.yes_item.is_empty() and self.no_item.is_empty() + + def __eq__(self, other): + return type(self) is type(other) and (self.group, self.yes_item, + self.no_item) == (other.group, other.yes_item, other.no_item) + + def max_width(self): + return max(self.yes_item.max_width(), self.no_item.max_width()) + + def __del__(self): + self.info = None + +class DefaultBoundary(ZeroWidthBase): + _opcode = OP.DEFAULT_BOUNDARY + _op_name = "DEFAULT_BOUNDARY" + +class DefaultEndOfWord(ZeroWidthBase): + _opcode = OP.DEFAULT_END_OF_WORD + _op_name = "DEFAULT_END_OF_WORD" + +class 
DefaultStartOfWord(ZeroWidthBase): + _opcode = OP.DEFAULT_START_OF_WORD + _op_name = "DEFAULT_START_OF_WORD" + +class EndOfLine(ZeroWidthBase): + _opcode = OP.END_OF_LINE + _op_name = "END_OF_LINE" + +class EndOfLineU(EndOfLine): + _opcode = OP.END_OF_LINE_U + _op_name = "END_OF_LINE_U" + +class EndOfString(ZeroWidthBase): + _opcode = OP.END_OF_STRING + _op_name = "END_OF_STRING" + +class EndOfStringLine(ZeroWidthBase): + _opcode = OP.END_OF_STRING_LINE + _op_name = "END_OF_STRING_LINE" + +class EndOfStringLineU(EndOfStringLine): + _opcode = OP.END_OF_STRING_LINE_U + _op_name = "END_OF_STRING_LINE_U" + +class EndOfWord(ZeroWidthBase): + _opcode = OP.END_OF_WORD + _op_name = "END_OF_WORD" + +class Failure(ZeroWidthBase): + _op_name = "FAILURE" + + def _compile(self, reverse, fuzzy): + return [(OP.FAILURE, )] + +class Fuzzy(RegexBase): + def __init__(self, subpattern, constraints=None): + RegexBase.__init__(self) + if constraints is None: + constraints = {} + self.subpattern = subpattern + self.constraints = constraints + + # If an error type is mentioned in the cost equation, then its maximum + # defaults to unlimited. + if "cost" in constraints: + for e in "dis": + if e in constraints["cost"]: + constraints.setdefault(e, (0, None)) + + # If any error type is mentioned, then all the error maxima default to + # 0, otherwise they default to unlimited. + if set(constraints) & set("dis"): + for e in "dis": + constraints.setdefault(e, (0, 0)) + else: + for e in "dis": + constraints.setdefault(e, (0, None)) + + # The maximum of the generic error type defaults to unlimited. + constraints.setdefault("e", (0, None)) + + # The cost equation defaults to equal costs. Also, the cost of any + # error type not mentioned in the cost equation defaults to 0. + if "cost" in constraints: + for e in "dis": + constraints["cost"].setdefault(e, 0) + else: + constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max": + constraints["e"][1]} + + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, True) + + def pack_characters(self, info): + self.subpattern = self.subpattern.pack_characters(info) + return self + + def remove_captures(self): + self.subpattern = self.subpattern.remove_captures() + return self + + def is_atomic(self): + return self.subpattern.is_atomic() + + def contains_group(self): + return self.subpattern.contains_group() + + def _compile(self, reverse, fuzzy): + # The individual limits. + arguments = [] + for e in "dise": + v = self.constraints[e] + arguments.append(v[0]) + arguments.append(UNLIMITED if v[1] is None else v[1]) + + # The coeffs of the cost equation. + for e in "dis": + arguments.append(self.constraints["cost"][e]) + + # The maximum of the cost equation. 
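+        # A maximum of None means no limit, encoded as UNLIMITED.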
+ v = self.constraints["cost"]["max"] + arguments.append(UNLIMITED if v is None else v) + + flags = 0 + if reverse: + flags |= REVERSE_OP + + test = self.constraints.get("test") + + if test: + return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] + + test.compile(reverse, True) + [(OP.NEXT,)] + + self.subpattern.compile(reverse, True) + [(OP.END,)]) + + return ([(OP.FUZZY, flags) + tuple(arguments)] + + self.subpattern.compile(reverse, True) + [(OP.END,)]) + + def dump(self, indent, reverse): + constraints = self._constraints_to_string() + if constraints: + constraints = " " + constraints + print("{}FUZZY{}".format(INDENT * indent, constraints)) + self.subpattern.dump(indent + 1, reverse) + + def is_empty(self): + return self.subpattern.is_empty() + + def __eq__(self, other): + return (type(self) is type(other) and self.subpattern == + other.subpattern and self.constraints == other.constraints) + + def max_width(self): + return UNLIMITED + + def _constraints_to_string(self): + constraints = [] + + for name in "ids": + min, max = self.constraints[name] + if max == 0: + continue + + con = "" + + if min > 0: + con = "{}<=".format(min) + + con += name + + if max is not None: + con += "<={}".format(max) + + constraints.append(con) + + cost = [] + for name in "ids": + coeff = self.constraints["cost"][name] + if coeff > 0: + cost.append("{}{}".format(coeff, name)) + + limit = self.constraints["cost"]["max"] + if limit is not None and limit > 0: + cost = "{}<={}".format("+".join(cost), limit) + constraints.append(cost) + + return ",".join(constraints) + +class Grapheme(RegexBase): + def _compile(self, reverse, fuzzy): + # Match at least 1 character until a grapheme boundary is reached. Note + # that this is the same whether matching forwards or backwards. + grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None), + GraphemeBoundary()])) + + return grapheme_matcher.compile(reverse, fuzzy) + + def dump(self, indent, reverse): + print("{}GRAPHEME".format(INDENT * indent)) + + def max_width(self): + return UNLIMITED + +class GraphemeBoundary: + def compile(self, reverse, fuzzy): + return [(OP.GRAPHEME_BOUNDARY, 1)] + +class GreedyRepeat(RegexBase): + _opcode = OP.GREEDY_REPEAT + _op_name = "GREEDY_REPEAT" + + def __init__(self, subpattern, min_count, max_count): + RegexBase.__init__(self) + self.subpattern = subpattern + self.min_count = min_count + self.max_count = max_count + + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info, reverse): + subpattern = self.subpattern.optimise(info, reverse) + + return type(self)(subpattern, self.min_count, self.max_count) + + def pack_characters(self, info): + self.subpattern = self.subpattern.pack_characters(info) + return self + + def remove_captures(self): + self.subpattern = self.subpattern.remove_captures() + return self + + def is_atomic(self): + return self.min_count == self.max_count and self.subpattern.is_atomic() + + def can_be_affix(self): + return False + + def contains_group(self): + return self.subpattern.contains_group() + + def get_firstset(self, reverse): + fs = self.subpattern.get_firstset(reverse) + if self.min_count == 0: + fs.add(None) + + return fs + + def _compile(self, reverse, fuzzy): + repeat = [self._opcode, self.min_count] + if self.max_count is None: + repeat.append(UNLIMITED) + else: + repeat.append(self.max_count) + + subpattern = self.subpattern.compile(reverse, fuzzy) + if not subpattern: + return [] + + return ([tuple(repeat)] + subpattern + 
[(OP.END, )]) + + def dump(self, indent, reverse): + if self.max_count is None: + limit = "INF" + else: + limit = self.max_count + print("{}{} {} {}".format(INDENT * indent, self._op_name, + self.min_count, limit)) + + self.subpattern.dump(indent + 1, reverse) + + def is_empty(self): + return self.subpattern.is_empty() + + def __eq__(self, other): + return type(self) is type(other) and (self.subpattern, self.min_count, + self.max_count) == (other.subpattern, other.min_count, + other.max_count) + + def max_width(self): + if self.max_count is None: + return UNLIMITED + + return self.subpattern.max_width() * self.max_count + + def get_required_string(self, reverse): + max_count = UNLIMITED if self.max_count is None else self.max_count + if self.min_count == 0: + w = self.subpattern.max_width() * max_count + return min(w, UNLIMITED), None + + ofs, req = self.subpattern.get_required_string(reverse) + if req: + return ofs, req + + w = self.subpattern.max_width() * max_count + return min(w, UNLIMITED), None + +class PossessiveRepeat(GreedyRepeat): + def is_atomic(self): + return True + + def _compile(self, reverse, fuzzy): + subpattern = self.subpattern.compile(reverse, fuzzy) + if not subpattern: + return [] + + repeat = [self._opcode, self.min_count] + if self.max_count is None: + repeat.append(UNLIMITED) + else: + repeat.append(self.max_count) + + return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ), + (OP.END, )]) + + def dump(self, indent, reverse): + print("{}ATOMIC".format(INDENT * indent)) + + if self.max_count is None: + limit = "INF" + else: + limit = self.max_count + print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name, + self.min_count, limit)) + + self.subpattern.dump(indent + 2, reverse) + +class Group(RegexBase): + def __init__(self, info, group, subpattern): + RegexBase.__init__(self) + self.info = info + self.group = group + self.subpattern = subpattern + + self.call_ref = None + + def fix_groups(self, pattern, reverse, fuzzy): + self.info.defined_groups[self.group] = (self, reverse, fuzzy) + self.subpattern.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info, reverse): + subpattern = self.subpattern.optimise(info, reverse) + + return Group(self.info, self.group, subpattern) + + def pack_characters(self, info): + self.subpattern = self.subpattern.pack_characters(info) + return self + + def remove_captures(self): + return self.subpattern.remove_captures() + + def is_atomic(self): + return self.subpattern.is_atomic() + + def can_be_affix(self): + return False + + def contains_group(self): + return True + + def get_firstset(self, reverse): + return self.subpattern.get_firstset(reverse) + + def has_simple_start(self): + return self.subpattern.has_simple_start() + + def _compile(self, reverse, fuzzy): + code = [] + + public_group = private_group = self.group + if private_group < 0: + public_group = self.info.private_groups[private_group] + private_group = self.info.group_count - private_group + + key = self.group, reverse, fuzzy + ref = self.info.call_refs.get(key) + if ref is not None: + code += [(OP.CALL_REF, ref)] + + code += [(OP.GROUP, int(not reverse), private_group, public_group)] + code += self.subpattern.compile(reverse, fuzzy) + code += [(OP.END, )] + + if ref is not None: + code += [(OP.END, )] + + return code + + def dump(self, indent, reverse): + group = self.group + if group < 0: + group = private_groups[group] + print("{}GROUP {}".format(INDENT * indent, group)) + self.subpattern.dump(indent + 1, reverse) + + def __eq__(self, other): + 
return (type(self) is type(other) and (self.group, self.subpattern) == + (other.group, other.subpattern)) + + def max_width(self): + return self.subpattern.max_width() + + def get_required_string(self, reverse): + return self.subpattern.get_required_string(reverse) + + def __del__(self): + self.info = None + +class Keep(ZeroWidthBase): + _opcode = OP.KEEP + _op_name = "KEEP" + +class LazyRepeat(GreedyRepeat): + _opcode = OP.LAZY_REPEAT + _op_name = "LAZY_REPEAT" + +class LookAround(RegexBase): + _dir_text = {False: "AHEAD", True: "BEHIND"} + + def __init__(self, behind, positive, subpattern): + RegexBase.__init__(self) + self.behind = bool(behind) + self.positive = bool(positive) + self.subpattern = subpattern + + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, self.behind, fuzzy) + + def optimise(self, info, reverse): + subpattern = self.subpattern.optimise(info, self.behind) + if self.positive and subpattern.is_empty(): + return subpattern + + return LookAround(self.behind, self.positive, subpattern) + + def pack_characters(self, info): + self.subpattern = self.subpattern.pack_characters(info) + return self + + def remove_captures(self): + return self.subpattern.remove_captures() + + def is_atomic(self): + return self.subpattern.is_atomic() + + def can_be_affix(self): + return self.subpattern.can_be_affix() + + def contains_group(self): + return self.subpattern.contains_group() + + def get_firstset(self, reverse): + if self.positive and self.behind == reverse: + return self.subpattern.get_firstset(reverse) + + return set([None]) + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if fuzzy: + flags |= FUZZY_OP + if reverse: + flags |= REVERSE_OP + + return ([(OP.LOOKAROUND, flags, int(not self.behind))] + + self.subpattern.compile(self.behind) + [(OP.END, )]) + + def dump(self, indent, reverse): + print("{}LOOK{} {}".format(INDENT * indent, + self._dir_text[self.behind], POS_TEXT[self.positive])) + self.subpattern.dump(indent + 1, self.behind) + + def is_empty(self): + return self.positive and self.subpattern.is_empty() + + def __eq__(self, other): + return type(self) is type(other) and (self.behind, self.positive, + self.subpattern) == (other.behind, other.positive, other.subpattern) + + def max_width(self): + return 0 + +class LookAroundConditional(RegexBase): + _dir_text = {False: "AHEAD", True: "BEHIND"} + + def __init__(self, behind, positive, subpattern, yes_item, no_item): + RegexBase.__init__(self) + self.behind = bool(behind) + self.positive = bool(positive) + self.subpattern = subpattern + self.yes_item = yes_item + self.no_item = no_item + + def fix_groups(self, pattern, reverse, fuzzy): + self.subpattern.fix_groups(pattern, reverse, fuzzy) + self.yes_item.fix_groups(pattern, reverse, fuzzy) + self.no_item.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info, reverse): + subpattern = self.subpattern.optimise(info, self.behind) + yes_item = self.yes_item.optimise(info, self.behind) + no_item = self.no_item.optimise(info, self.behind) + + return LookAroundConditional(self.behind, self.positive, subpattern, + yes_item, no_item) + + def pack_characters(self, info): + self.subpattern = self.subpattern.pack_characters(info) + self.yes_item = self.yes_item.pack_characters(info) + self.no_item = self.no_item.pack_characters(info) + return self + + def remove_captures(self): + self.subpattern = self.subpattern.remove_captures() + self.yes_item = self.yes_item.remove_captures() + self.no_item = 
self.no_item.remove_captures() + + def is_atomic(self): + return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and + self.no_item.is_atomic()) + + def can_be_affix(self): + return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix() + and self.no_item.can_be_affix()) + + def contains_group(self): + return (self.subpattern.contains_group() or + self.yes_item.contains_group() or self.no_item.contains_group()) + + def _compile(self, reverse, fuzzy): + code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))] + code.extend(self.subpattern.compile(self.behind, fuzzy)) + code.append((OP.NEXT, )) + code.extend(self.yes_item.compile(reverse, fuzzy)) + add_code = self.no_item.compile(reverse, fuzzy) + if add_code: + code.append((OP.NEXT, )) + code.extend(add_code) + + code.append((OP.END, )) + + return code + + def dump(self, indent, reverse): + print("{}CONDITIONAL {} {}".format(INDENT * indent, + self._dir_text[self.behind], POS_TEXT[self.positive])) + self.subpattern.dump(indent + 1, self.behind) + print("{}EITHER".format(INDENT * indent)) + self.yes_item.dump(indent + 1, reverse) + if not self.no_item.is_empty(): + print("{}OR".format(INDENT * indent)) + self.no_item.dump(indent + 1, reverse) + + def is_empty(self): + return (self.subpattern.is_empty() and self.yes_item.is_empty() or + self.no_item.is_empty()) + + def __eq__(self, other): + return type(self) is type(other) and (self.subpattern, self.yes_item, + self.no_item) == (other.subpattern, other.yes_item, other.no_item) + + def max_width(self): + return max(self.yes_item.max_width(), self.no_item.max_width()) + + def get_required_string(self, reverse): + return self.max_width(), None + +class PrecompiledCode(RegexBase): + def __init__(self, code): + self.code = code + + def _compile(self, reverse, fuzzy): + return [tuple(self.code)] + +class Property(RegexBase): + _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False): + OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False): + OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True): + OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE, + True): OP.PROPERTY_IGN_REV} + + def __init__(self, value, positive=True, case_flags=NOCASE, + zerowidth=False): + RegexBase.__init__(self) + self.value = value + self.positive = bool(positive) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + self.zerowidth = bool(zerowidth) + + self._key = (self.__class__, self.value, self.positive, + self.case_flags, self.zerowidth) + + def rebuild(self, positive, case_flags, zerowidth): + return Property(self.value, positive, case_flags, zerowidth) + + def optimise(self, info, reverse, in_set=False): + return self + + def get_firstset(self, reverse): + return set([self]) + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if self.zerowidth: + flags |= ZEROWIDTH_OP + if fuzzy: + flags |= FUZZY_OP + return [(self._opcode[self.case_flags, reverse], flags, self.value)] + + def dump(self, indent, reverse): + prop = PROPERTY_NAMES[self.value >> 16] + name, value = prop[0], prop[1][self.value & 0xFFFF] + print("{}PROPERTY {} {}:{}{}".format(INDENT * indent, + POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags])) + + def matches(self, ch): + return _regex.has_property_value(self.value, ch) == self.positive + + def max_width(self): + return 1 + +class Prune(ZeroWidthBase): + _op_name = "PRUNE" + + def _compile(self, reverse, 
fuzzy): + return [(OP.PRUNE, )] + +class Range(RegexBase): + _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN, + (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN, + (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV, + (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV} + _op_name = "RANGE" + + def __init__(self, lower, upper, positive=True, case_flags=NOCASE, + zerowidth=False): + RegexBase.__init__(self) + self.lower = lower + self.upper = upper + self.positive = bool(positive) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + self.zerowidth = bool(zerowidth) + + self._key = (self.__class__, self.lower, self.upper, self.positive, + self.case_flags, self.zerowidth) + + def rebuild(self, positive, case_flags, zerowidth): + return Range(self.lower, self.upper, positive, case_flags, zerowidth) + + def optimise(self, info, reverse, in_set=False): + # Is the range case-sensitive? + if not self.positive or not (self.case_flags & IGNORECASE) or in_set: + return self + + # Is full case-folding possible? + if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) != + FULLIGNORECASE): + return self + + # Get the characters which expand to multiple codepoints on folding. + expanding_chars = _regex.get_expand_on_folding() + + # Get the folded characters in the range. + items = [] + for ch in expanding_chars: + if self.lower <= ord(ch) <= self.upper: + folded = _regex.fold_case(FULL_CASE_FOLDING, ch) + items.append(String([ord(c) for c in folded], + case_flags=self.case_flags)) + + if not items: + # We can fall back to simple case-folding. + return self + + if len(items) < self.upper - self.lower + 1: + # Not all the characters are covered by the full case-folding. 
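+            # Keep the original range as a branch so that the remaining
+            # characters still match.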
+ items.insert(0, self) + + return Branch(items) + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if self.zerowidth: + flags |= ZEROWIDTH_OP + if fuzzy: + flags |= FUZZY_OP + return [(self._opcode[self.case_flags, reverse], flags, self.lower, + self.upper)] + + def dump(self, indent, reverse): + display_lower = ascii(chr(self.lower)).lstrip("bu") + display_upper = ascii(chr(self.upper)).lstrip("bu") + print("{}RANGE {} {} {}{}".format(INDENT * indent, + POS_TEXT[self.positive], display_lower, display_upper, + CASE_TEXT[self.case_flags])) + + def matches(self, ch): + return (self.lower <= ch <= self.upper) == self.positive + + def max_width(self): + return 1 + +class RefGroup(RegexBase): + _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False): + OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE, + False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE, + True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV, + (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV} + + def __init__(self, info, group, position, case_flags=NOCASE): + RegexBase.__init__(self) + self.info = info + self.group = group + self.position = position + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + + self._key = self.__class__, self.group, self.case_flags + + def fix_groups(self, pattern, reverse, fuzzy): + try: + self.group = int(self.group) + except ValueError: + try: + self.group = self.info.group_index[self.group] + except KeyError: + raise error("unknown group", pattern, self.position) + + if not 1 <= self.group <= self.info.group_count: + raise error("invalid group reference", pattern, self.position) + + self._key = self.__class__, self.group, self.case_flags + + def remove_captures(self): + raise error("group reference not allowed", pattern, self.position) + + def _compile(self, reverse, fuzzy): + flags = 0 + if fuzzy: + flags |= FUZZY_OP + return [(self._opcode[self.case_flags, reverse], flags, self.group)] + + def dump(self, indent, reverse): + print("{}REF_GROUP {}{}".format(INDENT * indent, self.group, + CASE_TEXT[self.case_flags])) + + def max_width(self): + return UNLIMITED + + def __del__(self): + self.info = None + +class SearchAnchor(ZeroWidthBase): + _opcode = OP.SEARCH_ANCHOR + _op_name = "SEARCH_ANCHOR" + +class Sequence(RegexBase): + def __init__(self, items=None): + RegexBase.__init__(self) + if items is None: + items = [] + + self.items = items + + def fix_groups(self, pattern, reverse, fuzzy): + for s in self.items: + s.fix_groups(pattern, reverse, fuzzy) + + def optimise(self, info, reverse): + # Flatten the sequences. + items = [] + for s in self.items: + s = s.optimise(info, reverse) + if isinstance(s, Sequence): + items.extend(s.items) + else: + items.append(s) + + return make_sequence(items) + + def pack_characters(self, info): + "Packs sequences of characters into strings." + items = [] + characters = [] + case_flags = NOCASE + for s in self.items: + if type(s) is Character and s.positive and not s.zerowidth: + if s.case_flags != case_flags: + # Different case sensitivity, so flush, unless neither the + # previous nor the new character are cased. 
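+                    # (A caseless character, such as a digit, can join a run
+                    # with either case flag without changing what it matches.)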
+ if s.case_flags or is_cased_i(info, s.value): + Sequence._flush_characters(info, characters, + case_flags, items) + + case_flags = s.case_flags + + characters.append(s.value) + elif type(s) is String or type(s) is Literal: + if s.case_flags != case_flags: + # Different case sensitivity, so flush, unless the neither + # the previous nor the new string are cased. + if s.case_flags or any(is_cased_i(info, c) for c in + characters): + Sequence._flush_characters(info, characters, + case_flags, items) + + case_flags = s.case_flags + + characters.extend(s.characters) + else: + Sequence._flush_characters(info, characters, case_flags, items) + + items.append(s.pack_characters(info)) + + Sequence._flush_characters(info, characters, case_flags, items) + + return make_sequence(items) + + def remove_captures(self): + self.items = [s.remove_captures() for s in self.items] + return self + + def is_atomic(self): + return all(s.is_atomic() for s in self.items) + + def can_be_affix(self): + return False + + def contains_group(self): + return any(s.contains_group() for s in self.items) + + def get_firstset(self, reverse): + fs = set() + items = self.items + if reverse: + items.reverse() + for s in items: + fs |= s.get_firstset(reverse) + if None not in fs: + return fs + fs.discard(None) + + return fs | set([None]) + + def has_simple_start(self): + return bool(self.items) and self.items[0].has_simple_start() + + def _compile(self, reverse, fuzzy): + seq = self.items + if reverse: + seq = seq[::-1] + + code = [] + for s in seq: + code.extend(s.compile(reverse, fuzzy)) + + return code + + def dump(self, indent, reverse): + for s in self.items: + s.dump(indent, reverse) + + @staticmethod + def _flush_characters(info, characters, case_flags, items): + if not characters: + return + + # Disregard case_flags if all of the characters are case-less. + if case_flags & IGNORECASE: + if not any(is_cased_i(info, c) for c in characters): + case_flags = NOCASE + + if (case_flags & FULLIGNORECASE) == FULLIGNORECASE: + literals = Sequence._fix_full_casefold(characters) + + for item in literals: + chars = item.characters + + if len(chars) == 1: + items.append(Character(chars[0], case_flags=item.case_flags)) + else: + items.append(String(chars, case_flags=item.case_flags)) + else: + if len(characters) == 1: + items.append(Character(characters[0], case_flags=case_flags)) + else: + items.append(String(characters, case_flags=case_flags)) + + characters[:] = [] + + @staticmethod + def _fix_full_casefold(characters): + # Split a literal needing full case-folding into chunks that need it + # and chunks that can use simple case-folding, which is faster. 
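+        # Find where any expanded folding occurs within the folded literal;
+        # those spans must keep FULLIGNORECASE, the rest can drop to the
+        # faster IGNORECASE.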
+ expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in + _regex.get_expand_on_folding()] + string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c) + for c in characters)).lower() + chunks = [] + + for e in expanded: + found = string.find(e) + + while found >= 0: + chunks.append((found, found + len(e))) + found = string.find(e, found + 1) + + pos = 0 + literals = [] + + for start, end in Sequence._merge_chunks(chunks): + if pos < start: + literals.append(Literal(characters[pos : start], + case_flags=IGNORECASE)) + + literals.append(Literal(characters[start : end], + case_flags=FULLIGNORECASE)) + pos = end + + if pos < len(characters): + literals.append(Literal(characters[pos : ], case_flags=IGNORECASE)) + + return literals + + @staticmethod + def _merge_chunks(chunks): + if len(chunks) < 2: + return chunks + + chunks.sort() + + start, end = chunks[0] + new_chunks = [] + + for s, e in chunks[1 : ]: + if s <= end: + end = max(end, e) + else: + new_chunks.append((start, end)) + start, end = s, e + + new_chunks.append((start, end)) + + return new_chunks + + def is_empty(self): + return all(i.is_empty() for i in self.items) + + def __eq__(self, other): + return type(self) is type(other) and self.items == other.items + + def max_width(self): + return sum(s.max_width() for s in self.items) + + def get_required_string(self, reverse): + seq = self.items + if reverse: + seq = seq[::-1] + + offset = 0 + + for s in seq: + ofs, req = s.get_required_string(reverse) + offset += ofs + if req: + return offset, req + + return offset, None + +class SetBase(RegexBase): + def __init__(self, info, items, positive=True, case_flags=NOCASE, + zerowidth=False): + RegexBase.__init__(self) + self.info = info + self.items = tuple(items) + self.positive = bool(positive) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + self.zerowidth = bool(zerowidth) + + self.char_width = 1 + + self._key = (self.__class__, self.items, self.positive, + self.case_flags, self.zerowidth) + + def rebuild(self, positive, case_flags, zerowidth): + return type(self)(self.info, self.items, positive, case_flags, + zerowidth).optimise(self.info, False) + + def get_firstset(self, reverse): + return set([self]) + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if self.zerowidth: + flags |= ZEROWIDTH_OP + if fuzzy: + flags |= FUZZY_OP + code = [(self._opcode[self.case_flags, reverse], flags)] + for m in self.items: + code.extend(m.compile()) + + code.append((OP.END, )) + + return code + + def dump(self, indent, reverse): + print("{}{} {}{}".format(INDENT * indent, self._op_name, + POS_TEXT[self.positive], CASE_TEXT[self.case_flags])) + for i in self.items: + i.dump(indent + 1, reverse) + + def _handle_case_folding(self, info, in_set): + # Is the set case-sensitive? + if not self.positive or not (self.case_flags & IGNORECASE) or in_set: + return self + + # Is full case-folding possible? + if (not (self.info.flags & UNICODE) or (self.case_flags & + FULLIGNORECASE) != FULLIGNORECASE): + return self + + # Get the characters which expand to multiple codepoints on folding. + expanding_chars = _regex.get_expand_on_folding() + + # Get the folded characters in the set. 
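+        # Each matching character that expands on folding contributes a
+        # String branch for its multi-character folding.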
+ items = [] + seen = set() + for ch in expanding_chars: + if self.matches(ord(ch)): + folded = _regex.fold_case(FULL_CASE_FOLDING, ch) + if folded not in seen: + items.append(String([ord(c) for c in folded], + case_flags=self.case_flags)) + seen.add(folded) + + if not items: + # We can fall back to simple case-folding. + return self + + return Branch([self] + items) + + def max_width(self): + # Is the set case-sensitive? + if not self.positive or not (self.case_flags & IGNORECASE): + return 1 + + # Is full case-folding possible? + if (not (self.info.flags & UNICODE) or (self.case_flags & + FULLIGNORECASE) != FULLIGNORECASE): + return 1 + + # Get the characters which expand to multiple codepoints on folding. + expanding_chars = _regex.get_expand_on_folding() + + # Get the folded characters in the set. + seen = set() + for ch in expanding_chars: + if self.matches(ord(ch)): + folded = _regex.fold_case(FULL_CASE_FOLDING, ch) + seen.add(folded) + + if not seen: + return 1 + + return max(len(folded) for folded in seen) + + def __del__(self): + self.info = None + +class SetDiff(SetBase): + _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False): + OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False): + OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True): + OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE, + True): OP.SET_DIFF_IGN_REV} + _op_name = "SET_DIFF" + + def optimise(self, info, reverse, in_set=False): + items = self.items + if len(items) > 2: + items = [items[0], SetUnion(info, items[1 : ])] + + if len(items) == 1: + return items[0].with_flags(case_flags=self.case_flags, + zerowidth=self.zerowidth).optimise(info, reverse, in_set) + + self.items = tuple(m.optimise(info, reverse, in_set=True) for m in + items) + + return self._handle_case_folding(info, in_set) + + def matches(self, ch): + m = self.items[0].matches(ch) and not self.items[1].matches(ch) + return m == self.positive + +class SetInter(SetBase): + _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False): + OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE, + False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE, + True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV, + (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV} + _op_name = "SET_INTER" + + def optimise(self, info, reverse, in_set=False): + items = [] + for m in self.items: + m = m.optimise(info, reverse, in_set=True) + if isinstance(m, SetInter) and m.positive: + # Intersection in intersection. 
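+                # Flatten it into a single n-ary intersection.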
+ items.extend(m.items) + else: + items.append(m) + + if len(items) == 1: + return items[0].with_flags(case_flags=self.case_flags, + zerowidth=self.zerowidth).optimise(info, reverse, in_set) + + self.items = tuple(items) + + return self._handle_case_folding(info, in_set) + + def matches(self, ch): + m = all(i.matches(ch) for i in self.items) + return m == self.positive + +class SetSymDiff(SetBase): + _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False): + OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE, + False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV, + (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True): + OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV} + _op_name = "SET_SYM_DIFF" + + def optimise(self, info, reverse, in_set=False): + items = [] + for m in self.items: + m = m.optimise(info, reverse, in_set=True) + if isinstance(m, SetSymDiff) and m.positive: + # Symmetric difference in symmetric difference. + items.extend(m.items) + else: + items.append(m) + + if len(items) == 1: + return items[0].with_flags(case_flags=self.case_flags, + zerowidth=self.zerowidth).optimise(info, reverse, in_set) + + self.items = tuple(items) + + return self._handle_case_folding(info, in_set) + + def matches(self, ch): + m = False + for i in self.items: + m = m != i.matches(ch) + + return m == self.positive + +class SetUnion(SetBase): + _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False): + OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE, + False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE, + True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV, + (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV} + _op_name = "SET_UNION" + + def optimise(self, info, reverse, in_set=False): + items = [] + for m in self.items: + m = m.optimise(info, reverse, in_set=True) + if isinstance(m, SetUnion) and m.positive: + # Union in union. 
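+                # Flatten it into a single n-ary union.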
+ items.extend(m.items) + else: + items.append(m) + + if len(items) == 1: + i = items[0] + return i.with_flags(positive=i.positive == self.positive, + case_flags=self.case_flags, + zerowidth=self.zerowidth).optimise(info, reverse, in_set) + + self.items = tuple(items) + + return self._handle_case_folding(info, in_set) + + def _compile(self, reverse, fuzzy): + flags = 0 + if self.positive: + flags |= POSITIVE_OP + if self.zerowidth: + flags |= ZEROWIDTH_OP + if fuzzy: + flags |= FUZZY_OP + + characters, others = defaultdict(list), [] + for m in self.items: + if isinstance(m, Character): + characters[m.positive].append(m.value) + else: + others.append(m) + + code = [(self._opcode[self.case_flags, reverse], flags)] + + for positive, values in characters.items(): + flags = 0 + if positive: + flags |= POSITIVE_OP + if len(values) == 1: + code.append((OP.CHARACTER, flags, values[0])) + else: + code.append((OP.STRING, flags, len(values)) + tuple(values)) + + for m in others: + code.extend(m.compile()) + + code.append((OP.END, )) + + return code + + def matches(self, ch): + m = any(i.matches(ch) for i in self.items) + return m == self.positive + +class Skip(ZeroWidthBase): + _op_name = "SKIP" + _opcode = OP.SKIP + +class StartOfLine(ZeroWidthBase): + _opcode = OP.START_OF_LINE + _op_name = "START_OF_LINE" + +class StartOfLineU(StartOfLine): + _opcode = OP.START_OF_LINE_U + _op_name = "START_OF_LINE_U" + +class StartOfString(ZeroWidthBase): + _opcode = OP.START_OF_STRING + _op_name = "START_OF_STRING" + +class StartOfWord(ZeroWidthBase): + _opcode = OP.START_OF_WORD + _op_name = "START_OF_WORD" + +class String(RegexBase): + _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN, + (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD, + (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV, + (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True): + OP.STRING_FLD_REV} + + def __init__(self, characters, case_flags=NOCASE): + self.characters = tuple(characters) + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + + if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE: + folded_characters = [] + for char in self.characters: + folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char)) + folded_characters.extend(ord(c) for c in folded) + else: + folded_characters = self.characters + + self.folded_characters = tuple(folded_characters) + self.required = False + + self._key = self.__class__, self.characters, self.case_flags + + def get_firstset(self, reverse): + if reverse: + pos = -1 + else: + pos = 0 + return set([Character(self.characters[pos], + case_flags=self.case_flags)]) + + def has_simple_start(self): + return True + + def _compile(self, reverse, fuzzy): + flags = 0 + if fuzzy: + flags |= FUZZY_OP + if self.required: + flags |= REQUIRED_OP + return [(self._opcode[self.case_flags, reverse], flags, + len(self.folded_characters)) + self.folded_characters] + + def dump(self, indent, reverse): + display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu") + print("{}STRING {}{}".format(INDENT * indent, display, + CASE_TEXT[self.case_flags])) + + def max_width(self): + return len(self.folded_characters) + + def get_required_string(self, reverse): + return 0, self + +class Literal(String): + def dump(self, indent, reverse): + literal = ''.join(chr(c) for c in self.characters) + display = ascii(literal).lstrip("bu") + print("{}LITERAL MATCH {}{}".format(INDENT * indent, display, + CASE_TEXT[self.case_flags])) + +class StringSet(Branch): + 
def __init__(self, info, name, case_flags=NOCASE): + self.info = info + self.name = name + self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags] + + self._key = self.__class__, self.name, self.case_flags + + self.set_key = (name, self.case_flags) + if self.set_key not in info.named_lists_used: + info.named_lists_used[self.set_key] = len(info.named_lists_used) + + index = self.info.named_lists_used[self.set_key] + items = self.info.kwargs[self.name] + + case_flags = self.case_flags + + encoding = self.info.flags & _ALL_ENCODINGS + fold_flags = encoding | case_flags + + choices = [] + + for string in items: + if isinstance(string, str): + string = [ord(c) for c in string] + + choices.append([Character(c, case_flags=case_flags) for c in + string]) + + # Sort from longest to shortest. + choices.sort(key=len, reverse=True) + + self.branches = [Sequence(choice) for choice in choices] + + def dump(self, indent, reverse): + print("{}STRING_SET {}{}".format(INDENT * indent, self.name, + CASE_TEXT[self.case_flags])) + + def __del__(self): + self.info = None + +class Source: + "Scanner for the regular expression source string." + def __init__(self, string): + if isinstance(string, str): + self.string = string + self.char_type = chr + else: + self.string = string.decode("latin-1") + self.char_type = lambda c: bytes([c]) + + self.pos = 0 + self.ignore_space = False + self.sep = string[ : 0] + + def get(self, override_ignore=False): + string = self.string + pos = self.pos + + try: + if self.ignore_space and not override_ignore: + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + else: + break + + ch = string[pos] + self.pos = pos + 1 + return ch + except IndexError: + # We've reached the end of the string. + self.pos = pos + return string[ : 0] + except ValueError: + # The comment extended to the end of the string. + self.pos = len(string) + return string[ : 0] + + def get_many(self, count=1): + string = self.string + pos = self.pos + + try: + if self.ignore_space: + substring = [] + + while len(substring) < count: + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + else: + break + + substring.append(string[pos]) + pos += 1 + + substring = "".join(substring) + else: + substring = string[pos : pos + count] + pos += len(substring) + + self.pos = pos + return substring + except IndexError: + # We've reached the end of the string. + self.pos = len(string) + return "".join(substring) + except ValueError: + # The comment extended to the end of the string. + self.pos = len(string) + return "".join(substring) + + def get_while(self, test_set, include=True, keep_spaces=False): + string = self.string + pos = self.pos + + if self.ignore_space and not keep_spaces: + try: + substring = [] + + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + elif (string[pos] in test_set) == include: + substring.append(string[pos]) + pos += 1 + else: + break + + self.pos = pos + except IndexError: + # We've reached the end of the string. + self.pos = len(string) + except ValueError: + # The comment extended to the end of the string. 
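+                # Whatever was gathered before the comment is still returned
+                # below.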
+ self.pos = len(string) + + return "".join(substring) + else: + try: + while (string[pos] in test_set) == include: + pos += 1 + + substring = string[self.pos : pos] + + self.pos = pos + + return substring + except IndexError: + # We've reached the end of the string. + substring = string[self.pos : pos] + + self.pos = pos + + return substring + + def skip_while(self, test_set, include=True): + string = self.string + pos = self.pos + + try: + if self.ignore_space: + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + elif (string[pos] in test_set) == include: + pos += 1 + else: + break + else: + while (string[pos] in test_set) == include: + pos += 1 + + self.pos = pos + except IndexError: + # We've reached the end of the string. + self.pos = len(string) + except ValueError: + # The comment extended to the end of the string. + self.pos = len(string) + + def match(self, substring): + string = self.string + pos = self.pos + + if self.ignore_space: + try: + for c in substring: + while True: + if string[pos].isspace(): + # Skip over the whitespace. + pos += 1 + elif string[pos] == "#": + # Skip over the comment to the end of the line. + pos = string.index("\n", pos) + else: + break + + if string[pos] != c: + return False + + pos += 1 + + self.pos = pos + + return True + except IndexError: + # We've reached the end of the string. + return False + except ValueError: + # The comment extended to the end of the string. + return False + else: + if not string.startswith(substring, pos): + return False + + self.pos = pos + len(substring) + + return True + + def expect(self, substring): + if not self.match(substring): + raise error("missing {}".format(substring), self.string, self.pos) + + def at_end(self): + string = self.string + pos = self.pos + + try: + if self.ignore_space: + while True: + if string[pos].isspace(): + pos += 1 + elif string[pos] == "#": + pos = string.index("\n", pos) + else: + break + + return pos >= len(string) + except IndexError: + # We've reached the end of the string. + return True + except ValueError: + # The comment extended to the end of the string. + return True + +class Info: + "Info about the regular expression." + + def __init__(self, flags=0, char_type=None, kwargs={}): + flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION] + self.flags = flags + self.global_flags = flags + self.inline_locale = False + + self.kwargs = kwargs + + self.group_count = 0 + self.group_index = {} + self.group_name = {} + self.char_type = char_type + self.named_lists_used = {} + self.open_groups = [] + self.open_group_count = {} + self.defined_groups = {} + self.group_calls = [] + self.private_groups = {} + + def open_group(self, name=None): + group = self.group_index.get(name) + if group is None: + while True: + self.group_count += 1 + if name is None or self.group_count not in self.group_name: + break + + group = self.group_count + if name: + self.group_index[name] = group + self.group_name[group] = name + + if group in self.open_groups: + # We have a nested named group. We'll assign it a private group + # number, initially negative until we can assign a proper + # (positive) number. 
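+            # Group._compile() later translates the alias back to the real
+            # group number.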
+ group_alias = -(len(self.private_groups) + 1) + self.private_groups[group_alias] = group + group = group_alias + + self.open_groups.append(group) + self.open_group_count[group] = self.open_group_count.get(group, 0) + 1 + + return group + + def close_group(self): + self.open_groups.pop() + + def is_open_group(self, name): + # In version 1, a group reference can refer to an open group. We'll + # just pretend the group isn't open. + version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION + if version == VERSION1: + return False + + if name.isdigit(): + group = int(name) + else: + group = self.group_index.get(name) + + return group in self.open_groups + +def _check_group_features(info, parsed): + """Checks whether the reverse and fuzzy features of the group calls match + the groups which they call. + """ + call_refs = {} + additional_groups = [] + for call, reverse, fuzzy in info.group_calls: + # Look up the reference of this group call. + key = (call.group, reverse, fuzzy) + ref = call_refs.get(key) + if ref is None: + # This group doesn't have a reference yet, so look up its features. + if call.group == 0: + # Calling the pattern as a whole. + rev = bool(info.flags & REVERSE) + fuz = isinstance(parsed, Fuzzy) + if (rev, fuz) != (reverse, fuzzy): + # The pattern as a whole doesn't have the features we want, + # so we'll need to make a copy of it with the desired + # features. + additional_groups.append((CallRef(len(call_refs), parsed), + reverse, fuzzy)) + else: + # Calling a capture group. + def_info = info.defined_groups[call.group] + group = def_info[0] + if def_info[1 : ] != (reverse, fuzzy): + # The group doesn't have the features we want, so we'll + # need to make a copy of it with the desired features. + additional_groups.append((group, reverse, fuzzy)) + + ref = len(call_refs) + call_refs[key] = ref + + call.call_ref = ref + + info.call_refs = call_refs + info.additional_groups = additional_groups + +def _get_required_string(parsed, flags): + "Gets the required string and related info of a parsed pattern." + + req_offset, required = parsed.get_required_string(bool(flags & REVERSE)) + if required: + required.required = True + if req_offset >= UNLIMITED: + req_offset = -1 + + req_flags = required.case_flags + if not (flags & UNICODE): + req_flags &= ~UNICODE + + req_chars = required.folded_characters + else: + req_offset = 0 + req_chars = () + req_flags = 0 + + return req_offset, req_chars, req_flags + +class Scanner: + def __init__(self, lexicon, flags=0): + self.lexicon = lexicon + + # Combine phrases into a compound pattern. + patterns = [] + for phrase, action in lexicon: + # Parse the regular expression. + source = Source(phrase) + info = Info(flags, source.char_type) + source.ignore_space = bool(info.flags & VERBOSE) + parsed = _parse_pattern(source, info) + if not source.at_end(): + raise error("unbalanced parenthesis", source.string, + source.pos) + + # We want to forbid capture groups within each phrase. + patterns.append(parsed.remove_captures()) + + # Combine all the subpatterns into one pattern. + info = Info(flags) + patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)] + parsed = Branch(patterns) + + # Optimise the compound pattern. + reverse = bool(info.flags & REVERSE) + parsed = parsed.optimise(info, reverse) + parsed = parsed.pack_characters(info) + + # Get the required string. + req_offset, req_chars, req_flags = _get_required_string(parsed, + info.flags) + + # Check the features of the groups. 
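+        # (Any group calls found here are rejected just below, since the
+        # Scanner class doesn't support recursive patterns.)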
+ _check_group_features(info, parsed) + + # Complain if there are any group calls. They are not supported by the + # Scanner class. + if info.call_refs: + raise error("recursive regex not supported by Scanner", + source.string, source.pos) + + reverse = bool(info.flags & REVERSE) + + # Compile the compound pattern. The result is a list of tuples. + code = parsed.compile(reverse) + [(OP.SUCCESS, )] + + # Flatten the code into a list of ints. + code = _flatten_code(code) + + if not parsed.has_simple_start(): + # Get the first set, if possible. + try: + fs_code = _compile_firstset(info, parsed.get_firstset(reverse)) + fs_code = _flatten_code(fs_code) + code = fs_code + code + except _FirstSetError: + pass + + # Check the global flags for conflicts. + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + if version not in (0, VERSION0, VERSION1): + raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible") + + # Create the PatternObject. + # + # Local flags like IGNORECASE affect the code generation, but aren't + # needed by the PatternObject itself. Conversely, global flags like + # LOCALE _don't_ affect the code generation but _are_ needed by the + # PatternObject. + self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version, + code, {}, {}, {}, [], req_offset, req_chars, req_flags, + len(patterns)) + + def scan(self, string): + result = [] + append = result.append + match = self.scanner.scanner(string).match + i = 0 + while True: + m = match() + if not m: + break + j = m.end() + if i == j: + break + action = self.lexicon[m.lastindex - 1][1] + if hasattr(action, '__call__'): + self.match = m + action = action(self, m.group()) + if action is not None: + append(action) + i = j + + return result, string[i : ] + +# Get the known properties dict. +PROPERTIES = _regex.get_properties() + +# Build the inverse of the properties dict. +PROPERTY_NAMES = {} +for prop_name, (prop_id, values) in PROPERTIES.items(): + name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {})) + name = max(name, prop_name, key=len) + PROPERTY_NAMES[prop_id] = name, prop_values + + for val_name, val_id in values.items(): + prop_values[val_id] = max(prop_values.get(val_id, ""), val_name, + key=len) + +# Character escape sequences. +CHARACTER_ESCAPES = { + "a": "\a", + "b": "\b", + "f": "\f", + "n": "\n", + "r": "\r", + "t": "\t", + "v": "\v", +} + +# Predefined character set escape sequences. +CHARSET_ESCAPES = { + "d": lookup_property(None, "Digit", True), + "D": lookup_property(None, "Digit", False), + "h": lookup_property(None, "Blank", True), + "s": lookup_property(None, "Space", True), + "S": lookup_property(None, "Space", False), + "w": lookup_property(None, "Word", True), + "W": lookup_property(None, "Word", False), +} + +# Positional escape sequences. +POSITION_ESCAPES = { + "A": StartOfString(), + "b": Boundary(), + "B": Boundary(False), + "K": Keep(), + "m": StartOfWord(), + "M": EndOfWord(), + "Z": EndOfString(), +} + +# Positional escape sequences when WORD flag set. +WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES) +WORD_POSITION_ESCAPES.update({ + "b": DefaultBoundary(), + "B": DefaultBoundary(False), + "m": DefaultStartOfWord(), + "M": DefaultEndOfWord(), +}) + +# Regex control verbs. 
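+# These are written (*VERB) in a pattern, e.g. (*FAIL), (*PRUNE), (*SKIP).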
+VERBS = {
+    "FAIL": Failure(),
+    "F": Failure(),
+    "PRUNE": Prune(),
+    "SKIP": Skip(),
+}
diff --git a/addon/synthDrivers/regex/regex/regex.py b/addon/synthDrivers/regex/regex/regex.py
new file mode 100644
index 0000000..0fdb4da
--- /dev/null
+++ b/addon/synthDrivers/regex/regex/regex.py
@@ -0,0 +1,746 @@
+#
+# Secret Labs' Regular Expression Engine
+#
+# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
+#
+# This version of the SRE library can be redistributed under CNRI's
+# Python 1.6 license. For any other use, please contact Secret Labs
+# AB (info@pythonware.com).
+#
+# Portions of this engine have been developed in cooperation with
+# CNRI. Hewlett-Packard provided funding for 1.6 integration and
+# other compatibility work.
+#
+# 2010-01-16 mrab Python front-end re-written and extended
+
+r"""Support for regular expressions (RE).
+
+This module provides regular expression matching operations similar to those
+found in Perl. It supports both 8-bit and Unicode strings; both the pattern and
+the strings being processed can contain null bytes and characters outside the
+US ASCII range.
+
+Regular expressions can contain both special and ordinary characters. Most
+ordinary characters, like "A", "a", or "0", are the simplest regular
+expressions; they simply match themselves. You can concatenate ordinary
+characters, so last matches the string 'last'.
+
+There are a few differences between the old (legacy) behaviour and the new
+(enhanced) behaviour, which are indicated by VERSION0 or VERSION1.
+
+The special characters are:
+    "."                Matches any character except a newline.
+    "^"                Matches the start of the string.
+    "$"                Matches the end of the string or just before the
+                       newline at the end of the string.
+    "*"                Matches 0 or more (greedy) repetitions of the preceding
+                       RE. Greedy means that it will match as many repetitions
+                       as possible.
+    "+"                Matches 1 or more (greedy) repetitions of the preceding
+                       RE.
+    "?"                Matches 0 or 1 (greedy) of the preceding RE.
+    *?,+?,??           Non-greedy versions of the previous three special
+                       characters.
+    *+,++,?+           Possessive versions of the previous three special
+                       characters.
+    {m,n}              Matches from m to n repetitions of the preceding RE.
+    {m,n}?             Non-greedy version of the above.
+    {m,n}+             Possessive version of the above.
+    {...}              Fuzzy matching constraints.
+    "\\"               Either escapes special characters or signals a special
+                       sequence.
+    [...]              Indicates a set of characters. A "^" as the first
+                       character indicates a complementing set.
+    "|"                A|B, creates an RE that will match either A or B.
+    (...)              Matches the RE inside the parentheses. The contents are
+                       captured and can be retrieved or matched later in the
+                       string.
+    (?flags-flags)     VERSION1: Sets/clears the flags for the remainder of
+                       the group or pattern; VERSION0: Sets the flags for the
+                       entire pattern.
+    (?:...)            Non-capturing version of regular parentheses.
+    (?>...)            Atomic non-capturing version of regular parentheses.
+    (?flags-flags:...) Non-capturing version of regular parentheses with local
+                       flags.
+    (?P<name>...)      The substring matched by the group is accessible by
+                       name.
+    (?<name>...)       The substring matched by the group is accessible by
+                       name.
+    (?P=name)          Matches the text matched earlier by the group named
+                       name.
+    (?#...)            A comment; ignored.
+    (?=...)            Matches if ... matches next, but doesn't consume the
+                       string.
+    (?!...)            Matches if ... doesn't match next.
+    (?<=...)           Matches if preceded by ....
+    (?<!...)           Matches if not preceded by ....
+    (?(id/name)yes|no) Matches yes pattern if the group with id/name matched,
+                       the (optional) no pattern otherwise.
+
+The special sequences consist of "\\" and a character from the list below. If
+the ordinary character is not on the list, then the resulting RE will match
+the second character.
+    \number            Matches the contents of the group of the same number.
+    \a                 Matches the bell character.
+    \A                 Matches only at the start of the string.
+    \b                 Matches the empty string, but only at the start or end
+                       of a word.
+    \B                 Matches the empty string, but not at the start or end
+                       of a word.
+    \d                 Matches any decimal digit; equivalent to the set [0-9]
+                       when matching a bytestring or a Unicode string with the
+                       ASCII flag, or the whole range of Unicode digits when
+                       matching a Unicode string.
+    \D                 Matches any non-digit character; equivalent to [^\d].
+    \f                 Matches the formfeed character.
+    \g<name>           Matches the text matched by the group named name.
+ \G Matches the empty string, but only at the position where + the search started. + \h Matches horizontal whitespace. + \K Keeps only what follows for the entire match. + \L Named list. The list is provided as a keyword argument. + \m Matches the empty string, but only at the start of a word. + \M Matches the empty string, but only at the end of a word. + \n Matches the newline character. + \N{name} Matches the named character. + \p{name=value} Matches the character if its property has the specified + value. + \P{name=value} Matches the character if its property hasn't the specified + value. + \r Matches the carriage-return character. + \s Matches any whitespace character; equivalent to + [ \t\n\r\f\v]. + \S Matches any non-whitespace character; equivalent to [^\s]. + \t Matches the tab character. + \uXXXX Matches the Unicode codepoint with 4-digit hex code XXXX. + \UXXXXXXXX Matches the Unicode codepoint with 8-digit hex code + XXXXXXXX. + \v Matches the vertical tab character. + \w Matches any alphanumeric character; equivalent to + [a-zA-Z0-9_] when matching a bytestring or a Unicode string + with the ASCII flag, or the whole range of Unicode + alphanumeric characters (letters plus digits plus + underscore) when matching a Unicode string. With LOCALE, it + will match the set [0-9_] plus characters defined as + letters for the current locale. + \W Matches the complement of \w; equivalent to [^\w]. + \xXX Matches the character with 2-digit hex code XX. + \X Matches a grapheme. + \Z Matches only at the end of the string. + \\ Matches a literal backslash. + +This module exports the following functions: + match Match a regular expression pattern at the beginning of a string. + fullmatch Match a regular expression pattern against all of a string. + search Search a string for the presence of a pattern. + sub Substitute occurrences of a pattern found in a string using a + template string. + subf Substitute occurrences of a pattern found in a string using a + format string. + subn Same as sub, but also return the number of substitutions made. + subfn Same as subf, but also return the number of substitutions made. + split Split a string by the occurrences of a pattern. VERSION1: will + split at zero-width match; VERSION0: won't split at zero-width + match. + splititer Return an iterator yielding the parts of a split string. + findall Find all occurrences of a pattern in a string. + finditer Return an iterator yielding a match object for each match. + compile Compile a pattern into a Pattern object. + purge Clear the regular expression cache. + escape Backslash all non-alphanumerics or special characters in a + string. + +Most of the functions support a concurrent parameter: if True, the GIL will be +released during matching, allowing other Python threads to run concurrently. If +the string changes during matching, the behaviour is undefined. This parameter +is not needed when working on the builtin (immutable) string classes. + +Some of the functions in this module take flags as optional parameters. Most of +these flags can also be set within an RE: + A a ASCII Make \w, \W, \b, \B, \d, and \D match the + corresponding ASCII character categories. Default + when matching a bytestring. + B b BESTMATCH Find the best fuzzy match (default is first). + D DEBUG Print the parsed pattern. + E e ENHANCEMATCH Attempt to improve the fit after finding the first + fuzzy match. + F f FULLCASE Use full case-folding when performing + case-insensitive matching in Unicode. 
+ I i IGNORECASE Perform case-insensitive matching. + L L LOCALE Make \w, \W, \b, \B, \d, and \D dependent on the + current locale. (One byte per character only.) + M m MULTILINE "^" matches the beginning of lines (after a newline) + as well as the string. "$" matches the end of lines + (before a newline) as well as the end of the string. + P p POSIX Perform POSIX-standard matching (leftmost longest). + R r REVERSE Searches backwards. + S s DOTALL "." matches any character at all, including the + newline. + U u UNICODE Make \w, \W, \b, \B, \d, and \D dependent on the + Unicode locale. Default when matching a Unicode + string. + V0 V0 VERSION0 Turn on the old legacy behaviour. + V1 V1 VERSION1 Turn on the new enhanced behaviour. This flag + includes the FULLCASE flag. + W w WORD Make \b and \B work with default Unicode word breaks + and make ".", "^" and "$" work with Unicode line + breaks. + X x VERBOSE Ignore whitespace and comments for nicer looking REs. + +This module also defines an exception 'error'. + +""" + +# Public symbols. +__all__ = ["cache_all", "compile", "DEFAULT_VERSION", "escape", "findall", + "finditer", "fullmatch", "match", "purge", "search", "split", "splititer", + "sub", "subf", "subfn", "subn", "template", "Scanner", "A", "ASCII", "B", + "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", "S", "DOTALL", "F", + "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P", "POSIX", + "R", "REVERSE", "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", + "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex", "__version__", + "__doc__", "RegexFlag"] + +__version__ = "2.5.148" + +# -------------------------------------------------------------------- +# Public interface. + +def match(pattern, string, flags=0, pos=None, endpos=None, partial=False, + concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Try to apply the pattern at the start of the string, returning a match + object, or None if no match was found.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.match(string, pos, endpos, concurrent, partial, timeout) + +def fullmatch(pattern, string, flags=0, pos=None, endpos=None, partial=False, + concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Try to apply the pattern against all of the string, returning a match + object, or None if no match was found.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.fullmatch(string, pos, endpos, concurrent, partial, timeout) + +def search(pattern, string, flags=0, pos=None, endpos=None, partial=False, + concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Search through string looking for a match to the pattern, returning a + match object, or None if no match was found.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.search(string, pos, endpos, concurrent, partial, timeout) + +def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, + concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Return the string obtained by replacing the leftmost (or rightmost with a + reverse pattern) non-overlapping occurrences of the pattern in string by the + replacement repl. 
repl can be either a string or a callable; if a string, + backslash escapes in it are processed; if a callable, it's passed the match + object and must return a replacement string to be used.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.sub(repl, string, count, pos, endpos, concurrent, timeout) + +def subf(pattern, format, string, count=0, flags=0, pos=None, endpos=None, + concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Return the string obtained by replacing the leftmost (or rightmost with a + reverse pattern) non-overlapping occurrences of the pattern in string by the + replacement format. format can be either a string or a callable; if a string, + it's treated as a format string; if a callable, it's passed the match object + and must return a replacement string to be used.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.subf(format, string, count, pos, endpos, concurrent, timeout) + +def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, + concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Return a 2-tuple containing (new_string, number). new_string is the string + obtained by replacing the leftmost (or rightmost with a reverse pattern) + non-overlapping occurrences of the pattern in the source string by the + replacement repl. number is the number of substitutions that were made. repl + can be either a string or a callable; if a string, backslash escapes in it + are processed; if a callable, it's passed the match object and must return a + replacement string to be used.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.subn(repl, string, count, pos, endpos, concurrent, timeout) + +def subfn(pattern, format, string, count=0, flags=0, pos=None, endpos=None, + concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Return a 2-tuple containing (new_string, number). new_string is the string + obtained by replacing the leftmost (or rightmost with a reverse pattern) + non-overlapping occurrences of the pattern in the source string by the + replacement format. number is the number of substitutions that were made. format + can be either a string or a callable; if a string, it's treated as a format + string; if a callable, it's passed the match object and must return a + replacement string to be used.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.subfn(format, string, count, pos, endpos, concurrent, timeout) + +def split(pattern, string, maxsplit=0, flags=0, concurrent=None, timeout=None, + ignore_unused=False, **kwargs): + """Split the source string by the occurrences of the pattern, returning a + list containing the resulting substrings. If capturing parentheses are used + in pattern, then the text of all groups in the pattern are also returned as + part of the resulting list. If maxsplit is nonzero, at most maxsplit splits + occur, and the remainder of the string is returned as the final element of + the list.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.split(string, maxsplit, concurrent, timeout) + +def splititer(pattern, string, maxsplit=0, flags=0, concurrent=None, + timeout=None, ignore_unused=False, **kwargs): + "Return an iterator yielding the parts of a split string." 
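+    # For example (an illustrative sketch):
+    #     >>> list(regex.splititer(":", "a:b:c"))
+    #     ['a', 'b', 'c']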
+ pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.splititer(string, maxsplit, concurrent, timeout) + +def findall(pattern, string, flags=0, pos=None, endpos=None, overlapped=False, + concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Return a list of all matches in the string. The matches may be overlapped + if overlapped is True. If one or more groups are present in the pattern, + return a list of groups; this will be a list of tuples if the pattern has + more than one group. Empty matches are included in the result.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.findall(string, pos, endpos, overlapped, concurrent, timeout) + +def finditer(pattern, string, flags=0, pos=None, endpos=None, overlapped=False, + partial=False, concurrent=None, timeout=None, ignore_unused=False, **kwargs): + """Return an iterator over all matches in the string. The matches may be + overlapped if overlapped is True. For each match, the iterator returns a + match object. Empty matches are included in the result.""" + pat = _compile(pattern, flags, ignore_unused, kwargs, True) + return pat.finditer(string, pos, endpos, overlapped, concurrent, partial, + timeout) + +def compile(pattern, flags=0, ignore_unused=False, cache_pattern=None, **kwargs): + "Compile a regular expression pattern, returning a pattern object." + if cache_pattern is None: + cache_pattern = _cache_all + return _compile(pattern, flags, ignore_unused, kwargs, cache_pattern) + +def purge(): + "Clear the regular expression cache" + _cache.clear() + _locale_sensitive.clear() + +# Whether to cache all patterns. +_cache_all = True + +def cache_all(value=True): + """Sets whether to cache all patterns, even those are compiled explicitly. + Passing None has no effect, but returns the current setting.""" + global _cache_all + + if value is None: + return _cache_all + + _cache_all = value + +def template(pattern, flags=0): + "Compile a template pattern, returning a pattern object." + return _compile(pattern, flags | TEMPLATE, False, {}, False) + +def escape(pattern, special_only=True, literal_spaces=False): + """Escape a string for use as a literal in a pattern. If special_only is + True, escape only special characters, else escape all non-alphanumeric + characters. If literal_spaces is True, don't escape spaces.""" + # Convert it to Unicode. + if isinstance(pattern, bytes): + p = pattern.decode("latin-1") + else: + p = pattern + + s = [] + if special_only: + for c in p: + if c == " " and literal_spaces: + s.append(c) + elif c in _METACHARS or c.isspace(): + s.append("\\") + s.append(c) + else: + s.append(c) + else: + for c in p: + if c == " " and literal_spaces: + s.append(c) + elif c in _ALNUM: + s.append(c) + else: + s.append("\\") + s.append(c) + + r = "".join(s) + # Convert it back to bytes if necessary. + if isinstance(pattern, bytes): + r = r.encode("latin-1") + + return r + +# -------------------------------------------------------------------- +# Internals. 
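+# The heart of the internals is _compile(), which parses, optimises and
+# compiles a pattern; module-level caches then avoid re-doing that work for
+# patterns that have been seen before. A minimal sketch of the effect (the
+# example assumes default settings):
+#
+#     >>> compile(r"\d+") is compile(r"\d+")  # second call served from _cache
+#     True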
+ +import regex._regex_core as _regex_core +import regex._regex as _regex +from threading import RLock as _RLock +from locale import getpreferredencoding as _getpreferredencoding +from regex._regex_core import * +from regex._regex_core import (_ALL_VERSIONS, _ALL_ENCODINGS, _FirstSetError, + _UnscopedFlagSet, _check_group_features, _compile_firstset, + _compile_replacement, _flatten_code, _fold_case, _get_required_string, + _parse_pattern, _shrink_cache) +from regex._regex_core import (ALNUM as _ALNUM, Info as _Info, OP as _OP, Source + as _Source, Fuzzy as _Fuzzy) + +# Version 0 is the old behaviour, compatible with the original 're' module. +# Version 1 is the new behaviour, which differs slightly. + +DEFAULT_VERSION = VERSION0 + +_METACHARS = frozenset("()[]{}?*+|^$\\.-#&~") + +_regex_core.DEFAULT_VERSION = DEFAULT_VERSION + +# Caches for the patterns and replacements. +_cache = {} +_cache_lock = _RLock() +_named_args = {} +_replacement_cache = {} +_locale_sensitive = {} + +# Maximum size of the cache. +_MAXCACHE = 500 +_MAXREPCACHE = 500 + +def _compile(pattern, flags, ignore_unused, kwargs, cache_it): + "Compiles a regular expression to a PatternObject." + + global DEFAULT_VERSION + try: + from regex import DEFAULT_VERSION + except ImportError: + pass + + # We won't bother to cache the pattern if we're debugging. + if (flags & DEBUG) != 0: + cache_it = False + + # What locale is this pattern using? + locale_key = (type(pattern), pattern) + if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0: + # This pattern is, or might be, locale-sensitive. + pattern_locale = _getpreferredencoding() + else: + # This pattern is definitely not locale-sensitive. + pattern_locale = None + + def complain_unused_args(): + if ignore_unused: + return + + # Complain about any unused keyword arguments, possibly resulting from a typo. + unused_kwargs = set(kwargs) - {k for k, v in args_needed} + if unused_kwargs: + any_one = next(iter(unused_kwargs)) + raise ValueError('unused keyword argument {!a}'.format(any_one)) + + if cache_it: + try: + # Do we know what keyword arguments are needed? + args_key = pattern, type(pattern), flags + args_needed = _named_args[args_key] + + # Are we being provided with its required keyword arguments? + args_supplied = set() + if args_needed: + for k, v in args_needed: + try: + args_supplied.add((k, frozenset(kwargs[k]))) + except KeyError: + raise error("missing named list: {!r}".format(k)) + + complain_unused_args() + + args_supplied = frozenset(args_supplied) + + # Have we already seen this regular expression and named list? + pattern_key = (pattern, type(pattern), flags, args_supplied, + DEFAULT_VERSION, pattern_locale) + return _cache[pattern_key] + except KeyError: + # It's a new pattern, or new named list for a known pattern. + pass + + # Guess the encoding from the class of the pattern string. + if isinstance(pattern, str): + guess_encoding = UNICODE + elif isinstance(pattern, bytes): + guess_encoding = ASCII + elif isinstance(pattern, Pattern): + if flags: + raise ValueError("cannot process flags argument with a compiled pattern") + + return pattern + else: + raise TypeError("first argument must be a string or compiled pattern") + + # Set the default version in the core code in case it has been changed. 
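+    # (DEFAULT_VERSION was re-imported from the regex module at the top of
+    # this function, so a user rebinding regex.DEFAULT_VERSION takes effect
+    # here.)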
+ _regex_core.DEFAULT_VERSION = DEFAULT_VERSION + + global_flags = flags + + while True: + caught_exception = None + try: + source = _Source(pattern) + info = _Info(global_flags, source.char_type, kwargs) + info.guess_encoding = guess_encoding + source.ignore_space = bool(info.flags & VERBOSE) + parsed = _parse_pattern(source, info) + break + except _UnscopedFlagSet: + # Remember the global flags for the next attempt. + global_flags = info.global_flags + except error as e: + caught_exception = e + + if caught_exception: + raise error(caught_exception.msg, caught_exception.pattern, + caught_exception.pos) + + if not source.at_end(): + raise error("unbalanced parenthesis", pattern, source.pos) + + # Check the global flags for conflicts. + version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION + if version not in (0, VERSION0, VERSION1): + raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible") + + if (info.flags & _ALL_ENCODINGS) not in (0, ASCII, LOCALE, UNICODE): + raise ValueError("ASCII, LOCALE and UNICODE flags are mutually incompatible") + + if isinstance(pattern, bytes) and (info.flags & UNICODE): + raise ValueError("cannot use UNICODE flag with a bytes pattern") + + if not (info.flags & _ALL_ENCODINGS): + if isinstance(pattern, str): + info.flags |= UNICODE + else: + info.flags |= ASCII + + reverse = bool(info.flags & REVERSE) + fuzzy = isinstance(parsed, _Fuzzy) + + # Remember whether this pattern as an inline locale flag. + _locale_sensitive[locale_key] = info.inline_locale + + # Fix the group references. + caught_exception = None + try: + parsed.fix_groups(pattern, reverse, False) + except error as e: + caught_exception = e + + if caught_exception: + raise error(caught_exception.msg, caught_exception.pattern, + caught_exception.pos) + + # Should we print the parsed pattern? + if flags & DEBUG: + parsed.dump(indent=0, reverse=reverse) + + # Optimise the parsed pattern. + parsed = parsed.optimise(info, reverse) + parsed = parsed.pack_characters(info) + + # Get the required string. + req_offset, req_chars, req_flags = _get_required_string(parsed, info.flags) + + # Build the named lists. + named_lists = {} + named_list_indexes = [None] * len(info.named_lists_used) + args_needed = set() + for key, index in info.named_lists_used.items(): + name, case_flags = key + values = frozenset(kwargs[name]) + if case_flags: + items = frozenset(_fold_case(info, v) for v in values) + else: + items = values + named_lists[name] = values + named_list_indexes[index] = items + args_needed.add((name, values)) + + complain_unused_args() + + # Check the features of the groups. + _check_group_features(info, parsed) + + # Compile the parsed pattern. The result is a list of tuples. + code = parsed.compile(reverse) + + # Is there a group call to the pattern as a whole? + key = (0, reverse, fuzzy) + ref = info.call_refs.get(key) + if ref is not None: + code = [(_OP.CALL_REF, ref)] + code + [(_OP.END, )] + + # Add the final 'success' opcode. + code += [(_OP.SUCCESS, )] + + # Compile the additional copies of the groups that we need. + for group, rev, fuz in info.additional_groups: + code += group.compile(rev, fuz) + + # Flatten the code into a list of ints. + code = _flatten_code(code) + + if not parsed.has_simple_start(): + # Get the first set, if possible. + try: + fs_code = _compile_firstset(info, parsed.get_firstset(reverse)) + fs_code = _flatten_code(fs_code) + code = fs_code + code + except _FirstSetError: + pass + + # The named capture groups. 
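+    # (info.group_index maps group name to group number; the PatternObject
+    # also needs the reverse mapping, from number back to name.)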
+ index_group = dict((v, n) for n, v in info.group_index.items()) + + # Create the PatternObject. + # + # Local flags like IGNORECASE affect the code generation, but aren't needed + # by the PatternObject itself. Conversely, global flags like LOCALE _don't_ + # affect the code generation but _are_ needed by the PatternObject. + compiled_pattern = _regex.compile(pattern, info.flags | version, code, + info.group_index, index_group, named_lists, named_list_indexes, + req_offset, req_chars, req_flags, info.group_count) + + # Do we need to reduce the size of the cache? + if len(_cache) >= _MAXCACHE: + with _cache_lock: + _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE) + + if cache_it: + if (info.flags & LOCALE) == 0: + pattern_locale = None + + args_needed = frozenset(args_needed) + + # Store this regular expression and named list. + pattern_key = (pattern, type(pattern), flags, args_needed, + DEFAULT_VERSION, pattern_locale) + _cache[pattern_key] = compiled_pattern + + # Store what keyword arguments are needed. + _named_args[args_key] = args_needed + + return compiled_pattern + +def _compile_replacement_helper(pattern, template): + "Compiles a replacement template." + # This function is called by the _regex module. + + # Have we seen this before? + key = pattern.pattern, pattern.flags, template + compiled = _replacement_cache.get(key) + if compiled is not None: + return compiled + + if len(_replacement_cache) >= _MAXREPCACHE: + _replacement_cache.clear() + + is_unicode = isinstance(template, str) + source = _Source(template) + if is_unicode: + def make_string(char_codes): + return "".join(chr(c) for c in char_codes) + else: + def make_string(char_codes): + return bytes(char_codes) + + compiled = [] + literal = [] + while True: + ch = source.get() + if not ch: + break + if ch == "\\": + # '_compile_replacement' will return either an int group reference + # or a string literal. It returns items (plural) in order to handle + # a 2-character literal (an invalid escape sequence). + is_group, items = _compile_replacement(source, pattern, is_unicode) + if is_group: + # It's a group, so first flush the literal. + if literal: + compiled.append(make_string(literal)) + literal = [] + compiled.extend(items) + else: + literal.extend(items) + else: + literal.append(ord(ch)) + + # Flush the literal. + if literal: + compiled.append(make_string(literal)) + + _replacement_cache[key] = compiled + + return compiled + +# We define Pattern here after all the support objects have been defined. +_pat = _compile('', 0, False, {}, False) +Pattern = type(_pat) +Match = type(_pat.match('')) +del _pat + +# Make Pattern public for typing annotations. +__all__.append("Pattern") +__all__.append("Match") + +# We'll define an alias for the 'compile' function so that the repr of a +# pattern object is eval-able. +Regex = compile + +# Register myself for pickling. +import copyreg as _copy_reg + +def _pickle(pattern): + return _regex.compile, pattern._pickled_data + +_copy_reg.pickle(Pattern, _pickle) diff --git a/addon/synthDrivers/regex/regex/test_regex.py b/addon/synthDrivers/regex/regex/test_regex.py new file mode 100644 index 0000000..bce5a87 --- /dev/null +++ b/addon/synthDrivers/regex/regex/test_regex.py @@ -0,0 +1,4488 @@ +from weakref import proxy +import copy +import pickle +import regex +import string +import sys +import unittest + +# String subclasses for issue 18468. 
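+# (Each subclass overrides __getitem__ so that slicing preserves the subclass,
+# letting the tests check that the original string type is kept.)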
+class StrSubclass(str):
+    def __getitem__(self, index):
+        return StrSubclass(super().__getitem__(index))
+
+class BytesSubclass(bytes):
+    def __getitem__(self, index):
+        return BytesSubclass(super().__getitem__(index))
+
+class RegexTests(unittest.TestCase):
+    PATTERN_CLASS = "<class '_regex.Pattern'>"
+    FLAGS_WITH_COMPILED_PAT = "cannot process flags argument with a compiled pattern"
+    INVALID_GROUP_REF = "invalid group reference"
+    MISSING_GT = "missing >"
+    BAD_GROUP_NAME = "bad character in group name"
+    MISSING_GROUP_NAME = "missing group name"
+    MISSING_LT = "missing <"
+    UNKNOWN_GROUP_I = "unknown group"
+    UNKNOWN_GROUP = "unknown group"
+    BAD_ESCAPE = r"bad escape \(end of pattern\)"
+    BAD_OCTAL_ESCAPE = r"bad escape \\"
+    BAD_SET = "unterminated character set"
+    STR_PAT_ON_BYTES = "cannot use a string pattern on a bytes-like object"
+    BYTES_PAT_ON_STR = "cannot use a bytes pattern on a string-like object"
+    STR_PAT_BYTES_TEMPL = "expected str instance, bytes found"
+    BYTES_PAT_STR_TEMPL = "expected a bytes-like object, str found"
+    BYTES_PAT_UNI_FLAG = "cannot use UNICODE flag with a bytes pattern"
+    MIXED_FLAGS = "ASCII, LOCALE and UNICODE flags are mutually incompatible"
+    MISSING_RPAREN = "missing \\)"
+    TRAILING_CHARS = "unbalanced parenthesis"
+    BAD_CHAR_RANGE = "bad character range"
+    NOTHING_TO_REPEAT = "nothing to repeat"
+    MULTIPLE_REPEAT = "multiple repeat"
+    OPEN_GROUP = "cannot refer to an open group"
+    DUPLICATE_GROUP = "duplicate group"
+    CANT_TURN_OFF = "bad inline flags: cannot turn flags off"
+    UNDEF_CHAR_NAME = "undefined character name"
+
+    def assertTypedEqual(self, actual, expect, msg=None):
+        self.assertEqual(actual, expect, msg)
+
+        def recurse(actual, expect):
+            if isinstance(expect, (tuple, list)):
+                for x, y in zip(actual, expect):
+                    recurse(x, y)
+            else:
+                self.assertIs(type(actual), type(expect), msg)
+
+        recurse(actual, expect)
+
+    def test_weakref(self):
+        s = 'QabbbcR'
+        x = regex.compile('ab+c')
+        y = proxy(x)
+        if x.findall('QabbbcR') != y.findall('QabbbcR'):
+            self.fail()
+
+    def test_search_star_plus(self):
+        self.assertEqual(regex.search('a*', 'xxx').span(0), (0, 0))
+        self.assertEqual(regex.search('x*', 'axx').span(), (0, 0))
+        self.assertEqual(regex.search('x+', 'axx').span(0), (1, 3))
+        self.assertEqual(regex.search('x+', 'axx').span(), (1, 3))
+        self.assertEqual(regex.search('x', 'aaa'), None)
+        self.assertEqual(regex.match('a*', 'xxx').span(0), (0, 0))
+        self.assertEqual(regex.match('a*', 'xxx').span(), (0, 0))
+        self.assertEqual(regex.match('x*', 'xxxa').span(0), (0, 3))
+        self.assertEqual(regex.match('x*', 'xxxa').span(), (0, 3))
+        self.assertEqual(regex.match('a+', 'xxx'), None)
+
+    def bump_num(self, matchobj):
+        int_value = int(matchobj[0])
+        return str(int_value + 1)
+
+    def test_basic_regex_sub(self):
+        self.assertEqual(regex.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
+        self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
+            '9.3 -3 24x100y')
+        self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y',
+            3), '9.3 -3 23x99y')
+
+        self.assertEqual(regex.sub('.', lambda m: r"\n", 'x'), "\\n")
+        self.assertEqual(regex.sub('.', r"\n", 'x'), "\n")
+
+        self.assertEqual(regex.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
+        self.assertEqual(regex.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
+        self.assertEqual(regex.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'),
+            'xxxx')
+        self.assertEqual(regex.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
+
+        self.assertEqual(regex.sub('a', r'\t\n\v\r\f\a\b', 'a'),
+            "\t\n\v\r\f\a\b")
+        self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'),
+            "\t\n\v\r\f\a")
"\t\n\v\r\f\a") + self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), chr(9) + chr(10) + + chr(11) + chr(13) + chr(12) + chr(7)) + + self.assertEqual(regex.sub(r'^\s*', 'X', 'test'), 'Xtest') + + self.assertEqual(regex.sub(r"x", r"\x0A", "x"), "\n") + self.assertEqual(regex.sub(r"x", r"\u000A", "x"), "\n") + self.assertEqual(regex.sub(r"x", r"\U0000000A", "x"), "\n") + self.assertEqual(regex.sub(r"x", r"\N{LATIN CAPITAL LETTER A}", + "x"), "A") + + self.assertEqual(regex.sub(br"x", br"\x0A", b"x"), b"\n") + + def test_bug_449964(self): + # Fails for group followed by other escape. + self.assertEqual(regex.sub(r'(?Px)', r'\g<1>\g<1>\b', 'xx'), + "xx\bxx\b") + + def test_bug_449000(self): + # Test for sub() on escaped characters. + self.assertEqual(regex.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), + "abc\ndef\n") + self.assertEqual(regex.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), + "abc\ndef\n") + self.assertEqual(regex.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), + "abc\ndef\n") + self.assertEqual(regex.sub('\r\n', '\n', 'abc\r\ndef\r\n'), + "abc\ndef\n") + + def test_bug_1661(self): + # Verify that flags do not get silently ignored with compiled patterns + pattern = regex.compile('.') + self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, + lambda: regex.match(pattern, 'A', regex.I)) + self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, + lambda: regex.search(pattern, 'A', regex.I)) + self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, + lambda: regex.findall(pattern, 'A', regex.I)) + self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, + lambda: regex.compile(pattern, regex.I)) + + def test_bug_3629(self): + # A regex that triggered a bug in the sre-code validator + self.assertEqual(repr(type(regex.compile("(?P)(?(quote))"))), + self.PATTERN_CLASS) + + def test_sub_template_numeric_escape(self): + # Bug 776311 and friends. 
+ self.assertEqual(regex.sub('x', r'\0', 'x'), "\0") + self.assertEqual(regex.sub('x', r'\000', 'x'), "\000") + self.assertEqual(regex.sub('x', r'\001', 'x'), "\001") + self.assertEqual(regex.sub('x', r'\008', 'x'), "\0" + "8") + self.assertEqual(regex.sub('x', r'\009', 'x'), "\0" + "9") + self.assertEqual(regex.sub('x', r'\111', 'x'), "\111") + self.assertEqual(regex.sub('x', r'\117', 'x'), "\117") + + self.assertEqual(regex.sub('x', r'\1111', 'x'), "\1111") + self.assertEqual(regex.sub('x', r'\1111', 'x'), "\111" + "1") + + self.assertEqual(regex.sub('x', r'\00', 'x'), '\x00') + self.assertEqual(regex.sub('x', r'\07', 'x'), '\x07') + self.assertEqual(regex.sub('x', r'\08', 'x'), "\0" + "8") + self.assertEqual(regex.sub('x', r'\09', 'x'), "\0" + "9") + self.assertEqual(regex.sub('x', r'\0a', 'x'), "\0" + "a") + + self.assertEqual(regex.sub('x', r'\400', 'x'), "\u0100") + self.assertEqual(regex.sub('x', r'\777', 'x'), "\u01FF") + self.assertEqual(regex.sub(b'x', br'\400', b'x'), b"\x00") + self.assertEqual(regex.sub(b'x', br'\777', b'x'), b"\xFF") + + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\1', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\8', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\9', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\11', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\18', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\1a', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\90', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\99', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\118', 'x')) # r'\11' + '8' + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\11a', 'x')) + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\181', 'x')) # r'\18' + '1' + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.sub('x', r'\800', 'x')) # r'\80' + '0' + + # In Python 2.3 (etc), these loop endlessly in sre_parser.py. + self.assertEqual(regex.sub('(((((((((((x)))))))))))', r'\11', 'x'), + 'x') + self.assertEqual(regex.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), + 'xz8') + self.assertEqual(regex.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), + 'xza') + + def test_qualified_re_sub(self): + self.assertEqual(regex.sub('a', 'b', 'aaaaa'), 'bbbbb') + self.assertEqual(regex.sub('a', 'b', 'aaaaa', 1), 'baaaa') + + def test_bug_114660(self): + self.assertEqual(regex.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), + 'hello there') + + def test_bug_462270(self): + # Test for empty sub() behaviour, see SF bug #462270 + if sys.version_info >= (3, 7, 0): + self.assertEqual(regex.sub('(?V0)x*', '-', 'abxd'), '-a-b--d-') + else: + self.assertEqual(regex.sub('(?V0)x*', '-', 'abxd'), '-a-b-d-') + self.assertEqual(regex.sub('(?V1)x*', '-', 'abxd'), '-a-b--d-') + self.assertEqual(regex.sub('x+', '-', 'abxd'), 'ab-d') + + def test_bug_14462(self): + # chr(255) is a valid identifier in Python 3. 
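+        # ...so it can also be used as a group name.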
+        group_name = '\xFF'
+        self.assertEqual(regex.search(r'(?P<' + group_name + '>a)',
+            'abc').group(group_name), 'a')
+
+    def test_symbolic_refs(self):
+        self.assertRaisesRegex(regex.error, self.MISSING_GT, lambda:
+            regex.sub('(?P<a>x)', r'\g<a', 'xx'))
+        self.assertRaisesRegex(regex.error, self.MISSING_GROUP_NAME, lambda:
+            regex.sub('(?P<a>x)', r'\g<', 'xx'))
+        self.assertRaisesRegex(regex.error, self.MISSING_LT, lambda:
+            regex.sub('(?P<a>x)', r'\g', 'xx'))
+        self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
+            regex.sub('(?P<a>x)', r'\g<a a>', 'xx'))
+        self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
+            regex.sub('(?P<a>x)', r'\g<1a1>', 'xx'))
+        self.assertRaisesRegex(IndexError, self.UNKNOWN_GROUP_I, lambda:
+            regex.sub('(?P<a>x)', r'\g<ab>', 'xx'))
+
+        # The new behaviour of unmatched but valid groups is to treat them like
+        # empty matches in the replacement template, like in Perl.
+        self.assertEqual(regex.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
+        self.assertEqual(regex.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
+
+        # The old behaviour was to raise it as an IndexError.
+        self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
+            regex.sub('(?P<a>x)', r'\g<-1>', 'xx'))
+
+    def test_re_subn(self):
+        self.assertEqual(regex.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
+        self.assertEqual(regex.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
+        self.assertEqual(regex.subn("b+", "x", "xyz"), ('xyz', 0))
+        self.assertEqual(regex.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
+        self.assertEqual(regex.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
+
+    def test_re_split(self):
+        self.assertEqual(regex.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
+        if sys.version_info >= (3, 7, 0):
+            self.assertEqual(regex.split(":*", ":a:b::c"), ['', '', 'a', '',
+                'b', '', 'c', ''])
+            self.assertEqual(regex.split("(:*)", ":a:b::c"), ['', ':', '', '',
+                'a', ':', '', '', 'b', '::', '', '', 'c', '', ''])
+            self.assertEqual(regex.split("(?::*)", ":a:b::c"), ['', '', 'a',
+                '', 'b', '', 'c', ''])
+            self.assertEqual(regex.split("(:)*", ":a:b::c"), ['', ':', '',
+                None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, ''])
+        else:
+            self.assertEqual(regex.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
+            self.assertEqual(regex.split("(:*)", ":a:b::c"), ['', ':', 'a',
+                ':', 'b', '::', 'c'])
+            self.assertEqual(regex.split("(?::*)", ":a:b::c"), ['', 'a', 'b',
+                'c'])
+            self.assertEqual(regex.split("(:)*", ":a:b::c"), ['', ':', 'a',
+                ':', 'b', ':', 'c'])
+        self.assertEqual(regex.split("([b:]+)", ":a:b::c"), ['', ':', 'a',
+            ':b::', 'c'])
+        self.assertEqual(regex.split("(b)|(:+)", ":a:b::c"), ['', None, ':',
+            'a', None, ':', '', 'b', None, '', None, '::', 'c'])
+        self.assertEqual(regex.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '',
+            '', 'c'])
+
+        self.assertEqual(regex.split("x", "xaxbxc"), ['', 'a', 'b', 'c'])
+        self.assertEqual([m for m in regex.splititer("x", "xaxbxc")], ['', 'a',
+            'b', 'c'])
+
+        self.assertEqual(regex.split("(?r)x", "xaxbxc"), ['c', 'b', 'a', ''])
+        self.assertEqual([m for m in regex.splititer("(?r)x", "xaxbxc")], ['c',
+            'b', 'a', ''])
+
+        self.assertEqual(regex.split("(x)|(y)", "xaxbxc"), ['', 'x', None, 'a',
+            'x', None, 'b', 'x', None, 'c'])
+        self.assertEqual([m for m in regex.splititer("(x)|(y)", "xaxbxc")],
+            ['', 'x', None, 'a', 'x', None, 'b', 'x', None, 'c'])
+
+        self.assertEqual(regex.split("(?r)(x)|(y)", "xaxbxc"), ['c', 'x', None,
+            'b', 'x', None, 'a', 'x', None, ''])
+        self.assertEqual([m for m in regex.splititer("(?r)(x)|(y)", "xaxbxc")],
+            ['c', 'x', None, 'b', 'x', None, 'a', 'x', None, ''])
+
+        self.assertEqual(regex.split(r"(?V1)\b", "a b c"), ['', 'a', ' ', 'b',
+            ' ', 'c', ''])
+
+        self.assertEqual(regex.split(r"(?V1)\m", "a b c"), ['', 'a ', 'b ',
+            'c'])
+        self.assertEqual(regex.split(r"(?V1)\M", "a b c"), ['a', ' b', ' c',
+            ''])
+
+    def test_qualified_re_split(self):
+        self.assertEqual(regex.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
+        self.assertEqual(regex.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
+        self.assertEqual(regex.split("(:)", ":a:b::c", 2), ['', ':', 'a', ':',
+            'b::c'])
+
+        if sys.version_info >= (3, 7, 0):
+            self.assertEqual(regex.split("(:*)", ":a:b::c", 2), ['', ':', '',
+                '', 'a:b::c'])
+        else:
+            self.assertEqual(regex.split("(:*)", ":a:b::c", 2), ['', ':', 'a',
+                ':', 'b::c'])
+
+    def test_re_findall(self):
+        self.assertEqual(regex.findall(":+", "abc"), [])
+        self.assertEqual(regex.findall(":+", "a:b::c:::d"), [':', '::',
+            ':::'])
+        self.assertEqual(regex.findall("(:+)", "a:b::c:::d"), [':', '::',
+            ':::'])
+        self.assertEqual(regex.findall("(:)(:*)", "a:b::c:::d"), [(':', ''),
+            (':', ':'), (':', '::')])
+
+        self.assertEqual(regex.findall(r"\((?P<test>.{0,5}?TEST)\)",
+            "(MY TEST)"), ["MY TEST"])
+        self.assertEqual(regex.findall(r"\((?P<test>.{0,3}?TEST)\)",
+            "(MY TEST)"), ["MY TEST"])
+        self.assertEqual(regex.findall(r"\((?P<test>.{0,3}?T)\)", "(MY T)"),
+            ["MY T"])
+
+        self.assertEqual(regex.findall(r"[^a]{2}[A-Z]", "\n  S"), ['  S'])
+        self.assertEqual(regex.findall(r"[^a]{2,3}[A-Z]", "\n  S"), ['\n  S'])
+        self.assertEqual(regex.findall(r"[^a]{2,3}[A-Z]", "\n   S"), ['   S'])
+
+        self.assertEqual(regex.findall(r"X(Y[^Y]+?){1,2}( |Q)+DEF",
+            "XYABCYPPQ\nQ DEF"), [('YPPQ\n', ' ')])
+
+        self.assertEqual(regex.findall(r"(\nTest(\n+.+?){0,2}?)?\n+End",
+            "\nTest\nxyz\nxyz\nEnd"), [('\nTest\nxyz\nxyz', '\nxyz')])
+
+    def test_bug_117612(self):
+        self.assertEqual(regex.findall(r"(a|(b))", "aba"), [('a', ''), ('b',
+            'b'), ('a', '')])
+
+    def test_re_match(self):
+        self.assertEqual(regex.match('a', 'a')[:], ('a',))
+        self.assertEqual(regex.match('(a)', 'a')[:], ('a', 'a'))
+        self.assertEqual(regex.match(r'(a)', 'a')[0], 'a')
+        self.assertEqual(regex.match(r'(a)', 'a')[1], 'a')
+        self.assertEqual(regex.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
+
+        pat = regex.compile('((a)|(b))(c)?')
+        self.assertEqual(pat.match('a')[:], ('a', 'a', 'a', None, None))
+        self.assertEqual(pat.match('b')[:], ('b', 'b', None, 'b', None))
+        self.assertEqual(pat.match('ac')[:], ('ac', 'a', 'a', None, 'c'))
+        self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c'))
+        self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c'))
+
+        # A single group.
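+        # (group() and group(0) return the whole match; group(1) returns the
+        # first capture group.)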
+        m = regex.match('(a)', 'a')
+        self.assertEqual(m.group(), 'a')
+        self.assertEqual(m.group(0), 'a')
+        self.assertEqual(m.group(1), 'a')
+        self.assertEqual(m.group(1, 1), ('a', 'a'))
+
+        pat = regex.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
+        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
+        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), (None, 'b',
+            None))
+        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
+
+    def test_re_groupref_exists(self):
+        self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', '(a)')[:],
+            ('(a)', '(', 'a'))
+        self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', 'a')[:], ('a',
+            None, 'a'))
+        self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'), None)
+        self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', '(a'), None)
+        self.assertEqual(regex.match('^(?:(a)|c)((?(1)b|d))$', 'ab')[:], ('ab',
+            'a', 'b'))
+        self.assertEqual(regex.match('^(?:(a)|c)((?(1)b|d))$', 'cd')[:], ('cd',
+            None, 'd'))
+        self.assertEqual(regex.match('^(?:(a)|c)((?(1)|d))$', 'cd')[:], ('cd',
+            None, 'd'))
+        self.assertEqual(regex.match('^(?:(a)|c)((?(1)|d))$', 'a')[:], ('a',
+            'a', ''))
+
+        # Tests for bug #1177831: exercise groups other than the first group.
+        p = regex.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
+        self.assertEqual(p.match('abc')[:], ('abc', 'a', 'b', 'c'))
+        self.assertEqual(p.match('ad')[:], ('ad', 'a', None, 'd'))
+        self.assertEqual(p.match('abd'), None)
+        self.assertEqual(p.match('ac'), None)
+
+    def test_re_groupref(self):
+        self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', '|a|')[:], ('|a|',
+            '|', 'a'))
+        self.assertEqual(regex.match(r'^(\|)?([^()]+)\1?$', 'a')[:], ('a',
+            None, 'a'))
+        self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
+        self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', '|a'), None)
+        self.assertEqual(regex.match(r'^(?:(a)|c)(\1)$', 'aa')[:], ('aa', 'a',
+            'a'))
+        self.assertEqual(regex.match(r'^(?:(a)|c)(\1)?$', 'c')[:], ('c', None,
+            None))
+
+        self.assertEqual(regex.findall(r"(?i)(.{1,40}?),(.{1,40}?)(?:;)+(.{1,80}).{1,40}?\3(\ |;)+(.{1,80}?)\1",
+            "TEST, BEST; LEST ; Lest 123 Test, Best"), [('TEST', ' BEST',
+            ' LEST', ' ', '123 ')])
+
+    def test_groupdict(self):
+        self.assertEqual(regex.match('(?P<first>first) (?P<second>second)',
+            'first second').groupdict(), {'first': 'first', 'second':
+            'second'})
+
+    def test_expand(self):
+        self.assertEqual(regex.match("(?P<first>first) (?P<second>second)",
+            "first second").expand(r"\2 \1 \g<second> \g<first>"),
+            'second first second first')
+
+    def test_repeat_minmax(self):
+        self.assertEqual(regex.match(r"^(\w){1}$", "abc"), None)
+        self.assertEqual(regex.match(r"^(\w){1}?$", "abc"), None)
+        self.assertEqual(regex.match(r"^(\w){1,2}$", "abc"), None)
+        self.assertEqual(regex.match(r"^(\w){1,2}?$", "abc"), None)
+
+        self.assertEqual(regex.match(r"^(\w){3}$", "abc")[1], 'c')
+        self.assertEqual(regex.match(r"^(\w){1,3}$", "abc")[1], 'c')
+        self.assertEqual(regex.match(r"^(\w){1,4}$", "abc")[1], 'c')
+        self.assertEqual(regex.match(r"^(\w){3,4}?$", "abc")[1], 'c')
+        self.assertEqual(regex.match(r"^(\w){3}?$", "abc")[1], 'c')
+        self.assertEqual(regex.match(r"^(\w){1,3}?$", "abc")[1], 'c')
+        self.assertEqual(regex.match(r"^(\w){1,4}?$", "abc")[1], 'c')
+        self.assertEqual(regex.match(r"^(\w){3,4}?$", "abc")[1], 'c')
+
+        self.assertEqual(regex.match("^x{1}$", "xxx"), None)
+        self.assertEqual(regex.match("^x{1}?$", "xxx"), None)
+        self.assertEqual(regex.match("^x{1,2}$", "xxx"), None)
+        self.assertEqual(regex.match("^x{1,2}?$", "xxx"), None)
+
+        self.assertEqual(regex.match("^x{1}", "xxx")[0], 'x')
+        self.assertEqual(regex.match("^x{1}?", "xxx")[0], 'x')
+        self.assertEqual(regex.match("^x{0,1}", "xxx")[0], 'x')
+        self.assertEqual(regex.match("^x{0,1}?", "xxx")[0], '')
+
+        self.assertEqual(bool(regex.match("^x{3}$", "xxx")), True)
+        self.assertEqual(bool(regex.match("^x{1,3}$", "xxx")), True)
+        self.assertEqual(bool(regex.match("^x{1,4}$", "xxx")), True)
+        self.assertEqual(bool(regex.match("^x{3,4}?$", "xxx")), True)
+        self.assertEqual(bool(regex.match("^x{3}?$", "xxx")), True)
+        self.assertEqual(bool(regex.match("^x{1,3}?$", "xxx")), True)
+        self.assertEqual(bool(regex.match("^x{1,4}?$", "xxx")), True)
+        self.assertEqual(bool(regex.match("^x{3,4}?$", "xxx")), True)
+
+        self.assertEqual(regex.match("^x{}$", "xxx"), None)
+        self.assertEqual(bool(regex.match("^x{}$", "x{}")), True)
+
+    def test_getattr(self):
+        self.assertEqual(regex.compile("(?i)(a)(b)").pattern, '(?i)(a)(b)')
+        self.assertEqual(regex.compile("(?i)(a)(b)").flags, regex.I | regex.U |
+            regex.DEFAULT_VERSION)
+        self.assertEqual(regex.compile(b"(?i)(a)(b)").flags, regex.A | regex.I
+            | regex.DEFAULT_VERSION)
+        self.assertEqual(regex.compile("(?i)(a)(b)").groups, 2)
+        self.assertEqual(regex.compile("(?i)(a)(b)").groupindex, {})
+
+        self.assertEqual(regex.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
+            {'first': 1, 'other': 2})
+
+        self.assertEqual(regex.match("(a)", "a").pos, 0)
+        self.assertEqual(regex.match("(a)", "a").endpos, 1)
+
+        self.assertEqual(regex.search("b(c)", "abcdef").pos, 0)
+        self.assertEqual(regex.search("b(c)", "abcdef").endpos, 6)
+        self.assertEqual(regex.search("b(c)", "abcdef").span(), (1, 3))
+        self.assertEqual(regex.search("b(c)", "abcdef").span(1), (2, 3))
+
+        self.assertEqual(regex.match("(a)", "a").string, 'a')
+        self.assertEqual(regex.match("(a)", "a").regs, ((0, 1), (0, 1)))
+        self.assertEqual(repr(type(regex.match("(a)", "a").re)),
+            self.PATTERN_CLASS)
+
+        # Issue 14260.
+        p = regex.compile(r'abc(?P<n>def)')
+        p.groupindex["n"] = 0
+        self.assertEqual(p.groupindex["n"], 1)
+
+    def test_special_escapes(self):
+        self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx")[1], 'bx')
+        self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd")[1], 'bx')
+        self.assertEqual(regex.search(br"\b(b.)\b", b"abcd abc bcd bx",
+            regex.LOCALE)[1], b'bx')
+        self.assertEqual(regex.search(br"\B(b.)\B", b"abc bcd bc abxd",
+            regex.LOCALE)[1], b'bx')
+        self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx",
+            regex.UNICODE)[1], 'bx')
+        self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd",
+            regex.UNICODE)[1], 'bx')
+
+        self.assertEqual(regex.search(r"^abc$", "\nabc\n", regex.M)[0], 'abc')
+        self.assertEqual(regex.search(r"^\Aabc\Z$", "abc", regex.M)[0], 'abc')
+        self.assertEqual(regex.search(r"^\Aabc\Z$", "\nabc\n", regex.M), None)
+
+        self.assertEqual(regex.search(br"\b(b.)\b", b"abcd abc bcd bx")[1],
+            b'bx')
+        self.assertEqual(regex.search(br"\B(b.)\B", b"abc bcd bc abxd")[1],
+            b'bx')
+        self.assertEqual(regex.search(br"^abc$", b"\nabc\n", regex.M)[0],
+            b'abc')
+        self.assertEqual(regex.search(br"^\Aabc\Z$", b"abc", regex.M)[0],
+            b'abc')
+        self.assertEqual(regex.search(br"^\Aabc\Z$", b"\nabc\n", regex.M),
+            None)
+
+        self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a")[0], '1aa! a')
+        self.assertEqual(regex.search(br"\d\D\w\W\s\S", b"1aa! a",
+            regex.LOCALE)[0], b'1aa! a')
+        self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a",
+            regex.UNICODE)[0], '1aa! 
a') + + def test_bigcharset(self): + self.assertEqual(regex.match(r"([\u2222\u2223])", "\u2222")[1], + '\u2222') + self.assertEqual(regex.match(r"([\u2222\u2223])", "\u2222", + regex.UNICODE)[1], '\u2222') + self.assertEqual("".join(regex.findall(".", + "e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), + 'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') + self.assertEqual("".join(regex.findall(r"[e\xe8\xe9\xea\xeb\u0113\u011b\u0117]", + "e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), + 'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') + self.assertEqual("".join(regex.findall(r"e|\xe8|\xe9|\xea|\xeb|\u0113|\u011b|\u0117", + "e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), + 'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') + + def test_anyall(self): + self.assertEqual(regex.match("a.b", "a\nb", regex.DOTALL)[0], "a\nb") + self.assertEqual(regex.match("a.*b", "a\n\nb", regex.DOTALL)[0], + "a\n\nb") + + def test_non_consuming(self): + self.assertEqual(regex.match(r"(a(?=\s[^a]))", "a b")[1], 'a') + self.assertEqual(regex.match(r"(a(?=\s[^a]*))", "a b")[1], 'a') + self.assertEqual(regex.match(r"(a(?=\s[abc]))", "a b")[1], 'a') + self.assertEqual(regex.match(r"(a(?=\s[abc]*))", "a bc")[1], 'a') + self.assertEqual(regex.match(r"(a)(?=\s\1)", "a a")[1], 'a') + self.assertEqual(regex.match(r"(a)(?=\s\1*)", "a aa")[1], 'a') + self.assertEqual(regex.match(r"(a)(?=\s(abc|a))", "a a")[1], 'a') + + self.assertEqual(regex.match(r"(a(?!\s[^a]))", "a a")[1], 'a') + self.assertEqual(regex.match(r"(a(?!\s[abc]))", "a d")[1], 'a') + self.assertEqual(regex.match(r"(a)(?!\s\1)", "a b")[1], 'a') + self.assertEqual(regex.match(r"(a)(?!\s(abc|a))", "a b")[1], 'a') + + def test_ignore_case(self): + self.assertEqual(regex.match("abc", "ABC", regex.I)[0], 'ABC') + self.assertEqual(regex.match(b"abc", b"ABC", regex.I)[0], b'ABC') + + self.assertEqual(regex.match(r"(a\s[^a]*)", "a bb", regex.I)[1], + 'a bb') + self.assertEqual(regex.match(r"(a\s[abc])", "a b", regex.I)[1], 'a b') + self.assertEqual(regex.match(r"(a\s[abc]*)", "a bb", regex.I)[1], + 'a bb') + self.assertEqual(regex.match(r"((a)\s\2)", "a a", regex.I)[1], 'a a') + self.assertEqual(regex.match(r"((a)\s\2*)", "a aa", regex.I)[1], + 'a aa') + self.assertEqual(regex.match(r"((a)\s(abc|a))", "a a", regex.I)[1], + 'a a') + self.assertEqual(regex.match(r"((a)\s(abc|a)*)", "a aa", regex.I)[1], + 'a aa') + + # Issue 3511. 
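+        # ([Z-a] is a literal range from "Z" to "a", which includes "_";
+        # IGNORECASE must not break that range.)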
+ self.assertEqual(regex.match(r"[Z-a]", "_").span(), (0, 1)) + self.assertEqual(regex.match(r"(?i)[Z-a]", "_").span(), (0, 1)) + + self.assertEqual(bool(regex.match(r"(?i)nao", "nAo")), True) + self.assertEqual(bool(regex.match(r"(?i)n\xE3o", "n\xC3o")), True) + self.assertEqual(bool(regex.match(r"(?i)n\xE3o", "N\xC3O")), True) + self.assertEqual(bool(regex.match(r"(?i)s", "\u017F")), True) + + def test_case_folding(self): + self.assertEqual(regex.search(r"(?fi)ss", "SS").span(), (0, 2)) + self.assertEqual(regex.search(r"(?fi)SS", "ss").span(), (0, 2)) + self.assertEqual(regex.search(r"(?fi)SS", + "\N{LATIN SMALL LETTER SHARP S}").span(), (0, 1)) + self.assertEqual(regex.search(r"(?fi)\N{LATIN SMALL LETTER SHARP S}", + "SS").span(), (0, 2)) + + self.assertEqual(regex.search(r"(?fi)\N{LATIN SMALL LIGATURE ST}", + "ST").span(), (0, 2)) + self.assertEqual(regex.search(r"(?fi)ST", + "\N{LATIN SMALL LIGATURE ST}").span(), (0, 1)) + self.assertEqual(regex.search(r"(?fi)ST", + "\N{LATIN SMALL LIGATURE LONG S T}").span(), (0, 1)) + + self.assertEqual(regex.search(r"(?fi)SST", + "\N{LATIN SMALL LETTER SHARP S}t").span(), (0, 2)) + self.assertEqual(regex.search(r"(?fi)SST", + "s\N{LATIN SMALL LIGATURE LONG S T}").span(), (0, 2)) + self.assertEqual(regex.search(r"(?fi)SST", + "s\N{LATIN SMALL LIGATURE ST}").span(), (0, 2)) + self.assertEqual(regex.search(r"(?fi)\N{LATIN SMALL LIGATURE ST}", + "SST").span(), (1, 3)) + self.assertEqual(regex.search(r"(?fi)SST", + "s\N{LATIN SMALL LIGATURE ST}").span(), (0, 2)) + + self.assertEqual(regex.search(r"(?fi)FFI", + "\N{LATIN SMALL LIGATURE FFI}").span(), (0, 1)) + self.assertEqual(regex.search(r"(?fi)FFI", + "\N{LATIN SMALL LIGATURE FF}i").span(), (0, 2)) + self.assertEqual(regex.search(r"(?fi)FFI", + "f\N{LATIN SMALL LIGATURE FI}").span(), (0, 2)) + self.assertEqual(regex.search(r"(?fi)\N{LATIN SMALL LIGATURE FFI}", + "FFI").span(), (0, 3)) + self.assertEqual(regex.search(r"(?fi)\N{LATIN SMALL LIGATURE FF}i", + "FFI").span(), (0, 3)) + self.assertEqual(regex.search(r"(?fi)f\N{LATIN SMALL LIGATURE FI}", + "FFI").span(), (0, 3)) + + sigma = "\u03A3\u03C3\u03C2" + for ch1 in sigma: + for ch2 in sigma: + if not regex.match(r"(?fi)" + ch1, ch2): + self.fail() + + self.assertEqual(bool(regex.search(r"(?iV1)ff", "\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)ff", "\uFB01\uFB00")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)fi", "\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)fi", "\uFB01\uFB00")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)fffi", "\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)f\uFB03", + "\uFB00\uFB01")), True) + self.assertEqual(bool(regex.search(r"(?iV1)ff", "\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)fi", "\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)fffi", "\uFB00\uFB01")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)f\uFB03", + "\uFB00\uFB01")), True) + self.assertEqual(bool(regex.search(r"(?iV1)f\uFB01", "\uFB00i")), + True) + self.assertEqual(bool(regex.search(r"(?iV1)f\uFB01", "\uFB00i")), + True) + + self.assertEqual(regex.findall(r"(?iV0)\m(?:word){e<=3}\M(?ne", "affine", + options=["\N{LATIN SMALL LIGATURE FFI}"]).span(), (0, 6)) + self.assertEqual(regex.search(r"(?fi)a\Lne", + "a\N{LATIN SMALL LIGATURE FFI}ne", options=["ffi"]).span(), (0, 4)) + + def test_category(self): + self.assertEqual(regex.match(r"(\s)", " ")[1], ' ') + + def test_not_literal(self): + 
self.assertEqual(regex.search(r"\s([^a])", " b")[1], 'b') + self.assertEqual(regex.search(r"\s([^a]*)", " bb")[1], 'bb') + + def test_search_coverage(self): + self.assertEqual(regex.search(r"\s(b)", " b")[1], 'b') + self.assertEqual(regex.search(r"a\s", "a ")[0], 'a ') + + def test_re_escape(self): + p = "" + self.assertEqual(regex.escape(p), p) + for i in range(0, 256): + p += chr(i) + self.assertEqual(bool(regex.match(regex.escape(chr(i)), chr(i))), + True) + self.assertEqual(regex.match(regex.escape(chr(i)), chr(i)).span(), + (0, 1)) + + pat = regex.compile(regex.escape(p)) + self.assertEqual(pat.match(p).span(), (0, 256)) + + def test_re_escape_byte(self): + p = b"" + self.assertEqual(regex.escape(p), p) + for i in range(0, 256): + b = bytes([i]) + p += b + self.assertEqual(bool(regex.match(regex.escape(b), b)), True) + self.assertEqual(regex.match(regex.escape(b), b).span(), (0, 1)) + + pat = regex.compile(regex.escape(p)) + self.assertEqual(pat.match(p).span(), (0, 256)) + + def test_constants(self): + if regex.I != regex.IGNORECASE: + self.fail() + if regex.L != regex.LOCALE: + self.fail() + if regex.M != regex.MULTILINE: + self.fail() + if regex.S != regex.DOTALL: + self.fail() + if regex.X != regex.VERBOSE: + self.fail() + + def test_flags(self): + for flag in [regex.I, regex.M, regex.X, regex.S, regex.L]: + self.assertEqual(repr(type(regex.compile('^pattern$', flag))), + self.PATTERN_CLASS) + + def test_sre_character_literals(self): + for i in [0, 8, 16, 32, 64, 127, 128, 255]: + self.assertEqual(bool(regex.match(r"\%03o" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"\%03o0" % i, chr(i) + "0")), + True) + self.assertEqual(bool(regex.match(r"\%03o8" % i, chr(i) + "8")), + True) + self.assertEqual(bool(regex.match(r"\x%02x" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"\x%02x0" % i, chr(i) + "0")), + True) + self.assertEqual(bool(regex.match(r"\x%02xz" % i, chr(i) + "z")), + True) + + self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: + regex.match(r"\911", "")) + + def test_sre_character_class_literals(self): + for i in [0, 8, 16, 32, 64, 127, 128, 255]: + self.assertEqual(bool(regex.match(r"[\%03o]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\%03o0]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\%03o8]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\x%02x]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\x%02x0]" % i, chr(i))), True) + self.assertEqual(bool(regex.match(r"[\x%02xz]" % i, chr(i))), True) + + self.assertRaisesRegex(regex.error, self.BAD_OCTAL_ESCAPE, lambda: + regex.match(r"[\911]", "")) + + def test_bug_113254(self): + self.assertEqual(regex.match(r'(a)|(b)', 'b').start(1), -1) + self.assertEqual(regex.match(r'(a)|(b)', 'b').end(1), -1) + self.assertEqual(regex.match(r'(a)|(b)', 'b').span(1), (-1, -1)) + + def test_bug_527371(self): + # Bug described in patches 527371/672491. + self.assertEqual(regex.match(r'(a)?a','a').lastindex, None) + self.assertEqual(regex.match(r'(a)(b)?b','ab').lastindex, 1) + self.assertEqual(regex.match(r'(?Pa)(?Pb)?b','ab').lastgroup, + 'a') + self.assertEqual(regex.match("(?Pa(b))", "ab").lastgroup, 'a') + self.assertEqual(regex.match("((a))", "a").lastindex, 1) + + def test_bug_545855(self): + # Bug 545855 -- This pattern failed to cause a compile error as it + # should, instead provoking a TypeError. 
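+        # ('foo[a-' contains an unterminated character set.)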
+ self.assertRaisesRegex(regex.error, self.BAD_SET, lambda: + regex.compile('foo[a-')) + + def test_bug_418626(self): + # Bugs 418626 at al. -- Testing Greg Chapman's addition of op code + # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of + # pattern '*?' on a long string. + self.assertEqual(regex.match('.*?c', 10000 * 'ab' + 'cd').end(0), + 20001) + self.assertEqual(regex.match('.*?cd', 5000 * 'ab' + 'c' + 5000 * 'ab' + + 'cde').end(0), 20003) + self.assertEqual(regex.match('.*?cd', 20000 * 'abc' + 'de').end(0), + 60001) + # Non-simple '*?' still used to hit the recursion limit, before the + # non-recursive scheme was implemented. + self.assertEqual(regex.search('(a|b)*?c', 10000 * 'ab' + 'cd').end(0), + 20001) + + def test_bug_612074(self): + pat = "[" + regex.escape("\u2039") + "]" + self.assertEqual(regex.compile(pat) and 1, 1) + + def test_stack_overflow(self): + # Nasty cases that used to overflow the straightforward recursive + # implementation of repeated groups. + self.assertEqual(regex.match('(x)*', 50000 * 'x')[1], 'x') + self.assertEqual(regex.match('(x)*y', 50000 * 'x' + 'y')[1], 'x') + self.assertEqual(regex.match('(x)*?y', 50000 * 'x' + 'y')[1], 'x') + + def test_scanner(self): + def s_ident(scanner, token): return token + def s_operator(scanner, token): return "op%s" % token + def s_float(scanner, token): return float(token) + def s_int(scanner, token): return int(token) + + scanner = regex.Scanner([(r"[a-zA-Z_]\w*", s_ident), (r"\d+\.\d*", + s_float), (r"\d+", s_int), (r"=|\+|-|\*|/", s_operator), (r"\s+", + None), ]) + + self.assertEqual(repr(type(scanner.scanner.scanner("").pattern)), + self.PATTERN_CLASS) + + self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), (['sum', + 'op=', 3, 'op*', 'foo', 'op+', 312.5, 'op+', 'bar'], '')) + + def test_bug_448951(self): + # Bug 448951 (similar to 429357, but with single char match). + # (Also test greedy matches.) + for op in '', '?', '*': + self.assertEqual(regex.match(r'((.%s):)?z' % op, 'z')[:], ('z', + None, None)) + self.assertEqual(regex.match(r'((.%s):)?z' % op, 'a:z')[:], ('a:z', + 'a:', 'a')) + + def test_bug_725106(self): + # Capturing groups in alternatives in repeats. + self.assertEqual(regex.match('^((a)|b)*', 'abc')[:], ('ab', 'b', 'a')) + self.assertEqual(regex.match('^(([ab])|c)*', 'abc')[:], ('abc', 'c', + 'b')) + self.assertEqual(regex.match('^((d)|[ab])*', 'abc')[:], ('ab', 'b', + None)) + self.assertEqual(regex.match('^((a)c|[ab])*', 'abc')[:], ('ab', 'b', + None)) + self.assertEqual(regex.match('^((a)|b)*?c', 'abc')[:], ('abc', 'b', + 'a')) + self.assertEqual(regex.match('^(([ab])|c)*?d', 'abcd')[:], ('abcd', + 'c', 'b')) + self.assertEqual(regex.match('^((d)|[ab])*?c', 'abc')[:], ('abc', 'b', + None)) + self.assertEqual(regex.match('^((a)c|[ab])*?c', 'abc')[:], ('abc', 'b', + None)) + + def test_bug_725149(self): + # Mark_stack_base restoring before restoring marks. + self.assertEqual(regex.match('(a)(?:(?=(b)*)c)*', 'abb')[:], ('a', 'a', + None)) + self.assertEqual(regex.match('(a)((?!(b)*))*', 'abb')[:], ('a', 'a', + None, None)) + + def test_bug_764548(self): + # Bug 764548, regex.compile() barfs on str/unicode subclasses. 
+ class my_unicode(str): pass + pat = regex.compile(my_unicode("abc")) + self.assertEqual(pat.match("xyz"), None) + + def test_finditer(self): + it = regex.finditer(r":+", "a:b::c:::d") + self.assertEqual([item[0] for item in it], [':', '::', ':::']) + + def test_bug_926075(self): + if regex.compile('bug_926075') is regex.compile(b'bug_926075'): + self.fail() + + def test_bug_931848(self): + pattern = "[\u002E\u3002\uFF0E\uFF61]" + self.assertEqual(regex.compile(pattern).split("a.b.c"), ['a', 'b', + 'c']) + + def test_bug_581080(self): + it = regex.finditer(r"\s", "a b") + self.assertEqual(next(it).span(), (1, 2)) + self.assertRaises(StopIteration, lambda: next(it)) + + scanner = regex.compile(r"\s").scanner("a b") + self.assertEqual(scanner.search().span(), (1, 2)) + self.assertEqual(scanner.search(), None) + + def test_bug_817234(self): + it = regex.finditer(r".*", "asdf") + self.assertEqual(next(it).span(), (0, 4)) + self.assertEqual(next(it).span(), (4, 4)) + self.assertRaises(StopIteration, lambda: next(it)) + + def test_empty_array(self): + # SF buf 1647541. + import array + for typecode in 'bBuhHiIlLfd': + a = array.array(typecode) + self.assertEqual(regex.compile(b"bla").match(a), None) + self.assertEqual(regex.compile(b"").match(a)[1 : ], ()) + + def test_inline_flags(self): + # Bug #1700. + upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Below + lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Below + + p = regex.compile(upper_char, regex.I | regex.U) + self.assertEqual(bool(p.match(lower_char)), True) + + p = regex.compile(lower_char, regex.I | regex.U) + self.assertEqual(bool(p.match(upper_char)), True) + + p = regex.compile('(?i)' + upper_char, regex.U) + self.assertEqual(bool(p.match(lower_char)), True) + + p = regex.compile('(?i)' + lower_char, regex.U) + self.assertEqual(bool(p.match(upper_char)), True) + + p = regex.compile('(?iu)' + upper_char) + self.assertEqual(bool(p.match(lower_char)), True) + + p = regex.compile('(?iu)' + lower_char) + self.assertEqual(bool(p.match(upper_char)), True) + + # Changed to positional flags in regex 2023.12.23. + self.assertEqual(bool(regex.match(r"(?i)a", "A")), True) + self.assertEqual(regex.match(r"a(?i)", "A"), None) + + def test_dollar_matches_twice(self): + # $ matches the end of string, and just before the terminating \n. + pattern = regex.compile('$') + self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') + self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') + self.assertEqual(pattern.sub('#', '\n'), '#\n#') + + pattern = regex.compile('$', regex.MULTILINE) + self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#') + self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') + self.assertEqual(pattern.sub('#', '\n'), '#\n#') + + def test_bytes_str_mixing(self): + # Mixing str and bytes is disallowed. 
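+        # A str pattern can be used only on str text, and a bytes pattern only on bytes.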
+        pat = regex.compile('.')
+        bpat = regex.compile(b'.')
+        self.assertRaisesRegex(TypeError, self.STR_PAT_ON_BYTES, lambda:
+          pat.match(b'b'))
+        self.assertRaisesRegex(TypeError, self.BYTES_PAT_ON_STR, lambda:
+          bpat.match('b'))
+        self.assertRaisesRegex(TypeError, self.STR_PAT_BYTES_TEMPL, lambda:
+          pat.sub(b'b', 'c'))
+        self.assertRaisesRegex(TypeError, self.STR_PAT_ON_BYTES, lambda:
+          pat.sub('b', b'c'))
+        self.assertRaisesRegex(TypeError, self.STR_PAT_ON_BYTES, lambda:
+          pat.sub(b'b', b'c'))
+        self.assertRaisesRegex(TypeError, self.BYTES_PAT_ON_STR, lambda:
+          bpat.sub(b'b', 'c'))
+        self.assertRaisesRegex(TypeError, self.BYTES_PAT_STR_TEMPL, lambda:
+          bpat.sub('b', b'c'))
+        self.assertRaisesRegex(TypeError, self.BYTES_PAT_ON_STR, lambda:
+          bpat.sub('b', 'c'))
+
+        self.assertRaisesRegex(ValueError, self.BYTES_PAT_UNI_FLAG, lambda:
+          regex.compile(br'\w', regex.UNICODE))
+        self.assertRaisesRegex(ValueError, self.BYTES_PAT_UNI_FLAG, lambda:
+          regex.compile(br'(?u)\w'))
+        self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda:
+          regex.compile(r'\w', regex.UNICODE | regex.ASCII))
+        self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda:
+          regex.compile(r'(?u)\w', regex.ASCII))
+        self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda:
+          regex.compile(r'(?a)\w', regex.UNICODE))
+        self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda:
+          regex.compile(r'(?au)\w'))
+
+    def test_ascii_and_unicode_flag(self):
+        # String patterns.
+        for flags in (0, regex.UNICODE):
+            pat = regex.compile('\xc0', flags | regex.IGNORECASE)
+            self.assertEqual(bool(pat.match('\xe0')), True)
+            pat = regex.compile(r'\w', flags)
+            self.assertEqual(bool(pat.match('\xe0')), True)
+
+        pat = regex.compile('\xc0', regex.ASCII | regex.IGNORECASE)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = regex.compile('(?a)\xc0', regex.IGNORECASE)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = regex.compile(r'\w', regex.ASCII)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = regex.compile(r'(?a)\w')
+        self.assertEqual(pat.match('\xe0'), None)
+
+        # Bytes patterns.
+        for flags in (0, regex.ASCII):
+            pat = regex.compile(b'\xc0', flags | regex.IGNORECASE)
+            self.assertEqual(pat.match(b'\xe0'), None)
+            pat = regex.compile(br'\w')
+            self.assertEqual(pat.match(b'\xe0'), None)
+
+        self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda:
+          regex.compile(r'(?au)\w'))
+
+    def test_subscripting_match(self):
+        m = regex.match(r'(?<a>\w)', 'xy')
+        if not m:
+            self.fail("Failed: expected match but returned None")
+        elif not m or m[0] != m.group(0) or m[1] != m.group(1):
+            self.fail("Failed")
+        if not m:
+            self.fail("Failed: expected match but returned None")
+        elif m[:] != ('x', 'x'):
+            self.fail("Failed: expected \"('x', 'x')\" but got {} instead".format(ascii(m[:])))
+
+    def test_new_named_groups(self):
+        m0 = regex.match(r'(?P<a>\w)', 'x')
+        m1 = regex.match(r'(?<a>\w)', 'x')
+        if not (m0 and m1 and m0[:] == m1[:]):
+            self.fail("Failed")
+
+    def test_properties(self):
+        self.assertEqual(regex.match(b'(?ai)\xC0', b'\xE0'), None)
+        self.assertEqual(regex.match(br'(?ai)\xC0', b'\xE0'), None)
+        self.assertEqual(regex.match(br'(?a)\w', b'\xE0'), None)
+        self.assertEqual(bool(regex.match(r'\w', '\xE0')), True)
+
+        # Dropped the following test. It's not possible to determine what the
+        # correct result should be in the general case.
+# self.assertEqual(bool(regex.match(br'(?L)\w', b'\xE0')), +# b'\xE0'.isalnum()) + + self.assertEqual(bool(regex.match(br'(?L)\d', b'0')), True) + self.assertEqual(bool(regex.match(br'(?L)\s', b' ')), True) + self.assertEqual(bool(regex.match(br'(?L)\w', b'a')), True) + self.assertEqual(regex.match(br'(?L)\d', b'?'), None) + self.assertEqual(regex.match(br'(?L)\s', b'?'), None) + self.assertEqual(regex.match(br'(?L)\w', b'?'), None) + + self.assertEqual(regex.match(br'(?L)\D', b'0'), None) + self.assertEqual(regex.match(br'(?L)\S', b' '), None) + self.assertEqual(regex.match(br'(?L)\W', b'a'), None) + self.assertEqual(bool(regex.match(br'(?L)\D', b'?')), True) + self.assertEqual(bool(regex.match(br'(?L)\S', b'?')), True) + self.assertEqual(bool(regex.match(br'(?L)\W', b'?')), True) + + self.assertEqual(bool(regex.match(r'\p{Cyrillic}', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'(?i)\p{Cyrillic}', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{IsCyrillic}', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{Script=Cyrillic}', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{InCyrillic}', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{Block=Cyrillic}', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:Cyrillic:]]', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:IsCyrillic:]]', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:Script=Cyrillic:]]', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:InCyrillic:]]', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:Block=Cyrillic:]]', + '\N{CYRILLIC CAPITAL LETTER A}')), True) + + self.assertEqual(bool(regex.match(r'\P{Cyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\P{IsCyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\P{Script=Cyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\P{InCyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\P{Block=Cyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{^Cyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{^IsCyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{^Script=Cyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{^InCyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'\p{^Block=Cyrillic}', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:^Cyrillic:]]', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:^IsCyrillic:]]', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:^Script=Cyrillic:]]', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:^InCyrillic:]]', + '\N{LATIN CAPITAL LETTER A}')), True) + self.assertEqual(bool(regex.match(r'[[:^Block=Cyrillic:]]', + '\N{LATIN CAPITAL LETTER A}')), True) + + self.assertEqual(bool(regex.match(r'\d', '0')), True) + self.assertEqual(bool(regex.match(r'\s', ' ')), True) + self.assertEqual(bool(regex.match(r'\w', 'A')), True) + 
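+        # \D, \S and \W are the exact complements of \d, \s and \w.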
self.assertEqual(regex.match(r"\d", "?"), None) + self.assertEqual(regex.match(r"\s", "?"), None) + self.assertEqual(regex.match(r"\w", "?"), None) + self.assertEqual(regex.match(r"\D", "0"), None) + self.assertEqual(regex.match(r"\S", " "), None) + self.assertEqual(regex.match(r"\W", "A"), None) + self.assertEqual(bool(regex.match(r'\D', '?')), True) + self.assertEqual(bool(regex.match(r'\S', '?')), True) + self.assertEqual(bool(regex.match(r'\W', '?')), True) + + self.assertEqual(bool(regex.match(r'\p{L}', 'A')), True) + self.assertEqual(bool(regex.match(r'\p{L}', 'a')), True) + self.assertEqual(bool(regex.match(r'\p{Lu}', 'A')), True) + self.assertEqual(bool(regex.match(r'\p{Ll}', 'a')), True) + + self.assertEqual(bool(regex.match(r'(?i)a', 'a')), True) + self.assertEqual(bool(regex.match(r'(?i)a', 'A')), True) + + self.assertEqual(bool(regex.match(r'\w', '0')), True) + self.assertEqual(bool(regex.match(r'\w', 'a')), True) + self.assertEqual(bool(regex.match(r'\w', '_')), True) + + self.assertEqual(regex.match(r"\X", "\xE0").span(), (0, 1)) + self.assertEqual(regex.match(r"\X", "a\u0300").span(), (0, 2)) + self.assertEqual(regex.findall(r"\X", + "a\xE0a\u0300e\xE9e\u0301"), ['a', '\xe0', 'a\u0300', 'e', + '\xe9', 'e\u0301']) + self.assertEqual(regex.findall(r"\X{3}", + "a\xE0a\u0300e\xE9e\u0301"), ['a\xe0a\u0300', 'e\xe9e\u0301']) + self.assertEqual(regex.findall(r"\X", "\r\r\n\u0301A\u0301"), + ['\r', '\r\n', '\u0301', 'A\u0301']) + + self.assertEqual(bool(regex.match(r'\p{Ll}', 'a')), True) + + chars_u = "-09AZaz_\u0393\u03b3" + chars_b = b"-09AZaz_" + word_set = set("Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc".split()) + + tests = [ + (r"\w", chars_u, "09AZaz_\u0393\u03b3"), + (r"[[:word:]]", chars_u, "09AZaz_\u0393\u03b3"), + (r"\W", chars_u, "-"), + (r"[[:^word:]]", chars_u, "-"), + (r"\d", chars_u, "09"), + (r"[[:digit:]]", chars_u, "09"), + (r"\D", chars_u, "-AZaz_\u0393\u03b3"), + (r"[[:^digit:]]", chars_u, "-AZaz_\u0393\u03b3"), + (r"[[:alpha:]]", chars_u, "AZaz\u0393\u03b3"), + (r"[[:^alpha:]]", chars_u, "-09_"), + (r"[[:alnum:]]", chars_u, "09AZaz\u0393\u03b3"), + (r"[[:^alnum:]]", chars_u, "-_"), + (r"[[:xdigit:]]", chars_u, "09Aa"), + (r"[[:^xdigit:]]", chars_u, "-Zz_\u0393\u03b3"), + (r"\p{InBasicLatin}", "a\xE1", "a"), + (r"\P{InBasicLatin}", "a\xE1", "\xE1"), + (r"(?i)\p{InBasicLatin}", "a\xE1", "a"), + (r"(?i)\P{InBasicLatin}", "a\xE1", "\xE1"), + + (br"(?L)\w", chars_b, b"09AZaz_"), + (br"(?L)[[:word:]]", chars_b, b"09AZaz_"), + (br"(?L)\W", chars_b, b"-"), + (br"(?L)[[:^word:]]", chars_b, b"-"), + (br"(?L)\d", chars_b, b"09"), + (br"(?L)[[:digit:]]", chars_b, b"09"), + (br"(?L)\D", chars_b, b"-AZaz_"), + (br"(?L)[[:^digit:]]", chars_b, b"-AZaz_"), + (br"(?L)[[:alpha:]]", chars_b, b"AZaz"), + (br"(?L)[[:^alpha:]]", chars_b, b"-09_"), + (br"(?L)[[:alnum:]]", chars_b, b"09AZaz"), + (br"(?L)[[:^alnum:]]", chars_b, b"-_"), + (br"(?L)[[:xdigit:]]", chars_b, b"09Aa"), + (br"(?L)[[:^xdigit:]]", chars_b, b"-Zz_"), + + (br"(?a)\w", chars_b, b"09AZaz_"), + (br"(?a)[[:word:]]", chars_b, b"09AZaz_"), + (br"(?a)\W", chars_b, b"-"), + (br"(?a)[[:^word:]]", chars_b, b"-"), + (br"(?a)\d", chars_b, b"09"), + (br"(?a)[[:digit:]]", chars_b, b"09"), + (br"(?a)\D", chars_b, b"-AZaz_"), + (br"(?a)[[:^digit:]]", chars_b, b"-AZaz_"), + (br"(?a)[[:alpha:]]", chars_b, b"AZaz"), + (br"(?a)[[:^alpha:]]", chars_b, b"-09_"), + (br"(?a)[[:alnum:]]", chars_b, b"09AZaz"), + (br"(?a)[[:^alnum:]]", chars_b, b"-_"), + (br"(?a)[[:xdigit:]]", chars_b, b"09Aa"), + (br"(?a)[[:^xdigit:]]", chars_b, b"-Zz_"), + ] + 
for pattern, chars, expected in tests: + try: + if chars[ : 0].join(regex.findall(pattern, chars)) != expected: + self.fail("Failed: {}".format(pattern)) + except Exception as e: + self.fail("Failed: {} raised {}".format(pattern, ascii(e))) + + self.assertEqual(bool(regex.match(r"\p{NumericValue=0}", "0")), + True) + self.assertEqual(bool(regex.match(r"\p{NumericValue=1/2}", + "\N{VULGAR FRACTION ONE HALF}")), True) + self.assertEqual(bool(regex.match(r"\p{NumericValue=0.5}", + "\N{VULGAR FRACTION ONE HALF}")), True) + + def test_word_class(self): + self.assertEqual(regex.findall(r"\w+", + " \u0939\u093f\u0928\u094d\u0926\u0940,"), + ['\u0939\u093f\u0928\u094d\u0926\u0940']) + self.assertEqual(regex.findall(r"\W+", + " \u0939\u093f\u0928\u094d\u0926\u0940,"), [' ', ',']) + self.assertEqual(regex.split(r"(?V1)\b", + " \u0939\u093f\u0928\u094d\u0926\u0940,"), [' ', + '\u0939\u093f\u0928\u094d\u0926\u0940', ',']) + self.assertEqual(regex.split(r"(?V1)\B", + " \u0939\u093f\u0928\u094d\u0926\u0940,"), ['', ' \u0939', + '\u093f', '\u0928', '\u094d', '\u0926', '\u0940,', '']) + + def test_search_anchor(self): + self.assertEqual(regex.findall(r"\G\w{2}", "abcd ef"), ['ab', 'cd']) + + def test_search_reverse(self): + self.assertEqual(regex.findall(r"(?r).", "abc"), ['c', 'b', 'a']) + self.assertEqual(regex.findall(r"(?r).", "abc", overlapped=True), ['c', + 'b', 'a']) + self.assertEqual(regex.findall(r"(?r)..", "abcde"), ['de', 'bc']) + self.assertEqual(regex.findall(r"(?r)..", "abcde", overlapped=True), + ['de', 'cd', 'bc', 'ab']) + self.assertEqual(regex.findall(r"(?r)(.)(-)(.)", "a-b-c", + overlapped=True), [("b", "-", "c"), ("a", "-", "b")]) + + self.assertEqual([m[0] for m in regex.finditer(r"(?r).", "abc")], ['c', + 'b', 'a']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", + overlapped=True)], ['de', 'cd', 'bc', 'ab']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r).", "abc")], ['c', + 'b', 'a']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", + overlapped=True)], ['de', 'cd', 'bc', 'ab']) + + self.assertEqual(regex.findall(r"^|\w+", "foo bar"), ['', 'foo', + 'bar']) + self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"), ['', 'foo', + 'bar']) + self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"), ['bar', 'foo', + '']) + self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"), ['bar', + 'foo', '']) + + self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")], + ['', 'foo', 'bar']) + self.assertEqual([m[0] for m in regex.finditer(r"(?V1)^|\w+", + "foo bar")], ['', 'foo', 'bar']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+", + "foo bar")], ['bar', 'foo', '']) + self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+", + "foo bar")], ['bar', 'foo', '']) + + self.assertEqual(regex.findall(r"\G\w{2}", "abcd ef"), ['ab', 'cd']) + self.assertEqual(regex.findall(r".{2}(?<=\G.*)", "abcd"), ['ab', 'cd']) + self.assertEqual(regex.findall(r"(?r)\G\w{2}", "abcd ef"), []) + self.assertEqual(regex.findall(r"(?r)\w{2}\G", "abcd ef"), ['ef']) + + self.assertEqual(regex.findall(r"q*", "qqwe"), ['qq', '', '', '']) + self.assertEqual(regex.findall(r"(?V1)q*", "qqwe"), ['qq', '', '', '']) + self.assertEqual(regex.findall(r"(?r)q*", "qqwe"), ['', '', 'qq', '']) + self.assertEqual(regex.findall(r"(?rV1)q*", "qqwe"), ['', '', 'qq', + '']) + + self.assertEqual(regex.findall(".", "abcd", pos=1, endpos=3), ['b', + 'c']) + self.assertEqual(regex.findall(".", "abcd", pos=1, endpos=-1), ['b', + 'c']) + self.assertEqual([m[0] for 
m in regex.finditer(".", "abcd", pos=1, + endpos=3)], ['b', 'c']) + self.assertEqual([m[0] for m in regex.finditer(".", "abcd", pos=1, + endpos=-1)], ['b', 'c']) + + self.assertEqual([m[0] for m in regex.finditer("(?r).", "abcd", pos=1, + endpos=3)], ['c', 'b']) + self.assertEqual([m[0] for m in regex.finditer("(?r).", "abcd", pos=1, + endpos=-1)], ['c', 'b']) + self.assertEqual(regex.findall("(?r).", "abcd", pos=1, endpos=3), ['c', + 'b']) + self.assertEqual(regex.findall("(?r).", "abcd", pos=1, endpos=-1), + ['c', 'b']) + + self.assertEqual(regex.findall(r"[ab]", "aB", regex.I), ['a', 'B']) + self.assertEqual(regex.findall(r"(?r)[ab]", "aB", regex.I), ['B', 'a']) + + self.assertEqual(regex.findall(r"(?r).{2}", "abc"), ['bc']) + self.assertEqual(regex.findall(r"(?r).{2}", "abc", overlapped=True), + ['bc', 'ab']) + self.assertEqual(regex.findall(r"(\w+) (\w+)", + "first second third fourth fifth"), [('first', 'second'), ('third', + 'fourth')]) + self.assertEqual(regex.findall(r"(?r)(\w+) (\w+)", + "first second third fourth fifth"), [('fourth', 'fifth'), ('second', + 'third')]) + + self.assertEqual([m[0] for m in regex.finditer(r"(?r).{2}", "abc")], + ['bc']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r).{2}", "abc", + overlapped=True)], ['bc', 'ab']) + self.assertEqual([m[0] for m in regex.finditer(r"(\w+) (\w+)", + "first second third fourth fifth")], ['first second', + 'third fourth']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)(\w+) (\w+)", + "first second third fourth fifth")], ['fourth fifth', + 'second third']) + + self.assertEqual(regex.search("abcdef", "abcdef").span(), (0, 6)) + self.assertEqual(regex.search("(?r)abcdef", "abcdef").span(), (0, 6)) + self.assertEqual(regex.search("(?i)abcdef", "ABCDEF").span(), (0, 6)) + self.assertEqual(regex.search("(?ir)abcdef", "ABCDEF").span(), (0, 6)) + + self.assertEqual(regex.sub(r"(.)", r"\1", "abc"), 'abc') + self.assertEqual(regex.sub(r"(?r)(.)", r"\1", "abc"), 'abc') + + def test_atomic(self): + # Issue 433030. + self.assertEqual(regex.search(r"(?>a*)a", "aa"), None) + + def test_possessive(self): + # Single-character non-possessive. + self.assertEqual(regex.search(r"a?a", "a").span(), (0, 1)) + self.assertEqual(regex.search(r"a*a", "aaa").span(), (0, 3)) + self.assertEqual(regex.search(r"a+a", "aaa").span(), (0, 3)) + self.assertEqual(regex.search(r"a{1,3}a", "aaa").span(), (0, 3)) + + # Multiple-character non-possessive. + self.assertEqual(regex.search(r"(?:ab)?ab", "ab").span(), (0, 2)) + self.assertEqual(regex.search(r"(?:ab)*ab", "ababab").span(), (0, 6)) + self.assertEqual(regex.search(r"(?:ab)+ab", "ababab").span(), (0, 6)) + self.assertEqual(regex.search(r"(?:ab){1,3}ab", "ababab").span(), (0, + 6)) + + # Single-character possessive. + self.assertEqual(regex.search(r"a?+a", "a"), None) + self.assertEqual(regex.search(r"a*+a", "aaa"), None) + self.assertEqual(regex.search(r"a++a", "aaa"), None) + self.assertEqual(regex.search(r"a{1,3}+a", "aaa"), None) + + # Multiple-character possessive. + self.assertEqual(regex.search(r"(?:ab)?+ab", "ab"), None) + self.assertEqual(regex.search(r"(?:ab)*+ab", "ababab"), None) + self.assertEqual(regex.search(r"(?:ab)++ab", "ababab"), None) + self.assertEqual(regex.search(r"(?:ab){1,3}+ab", "ababab"), None) + + def test_zerowidth(self): + # Issue 3262. 
+ if sys.version_info >= (3, 7, 0): + self.assertEqual(regex.split(r"\b", "a b"), ['', 'a', ' ', 'b', + '']) + else: + self.assertEqual(regex.split(r"\b", "a b"), ['a b']) + self.assertEqual(regex.split(r"(?V1)\b", "a b"), ['', 'a', ' ', 'b', + '']) + + # Issue 1647489. + self.assertEqual(regex.findall(r"^|\w+", "foo bar"), ['', 'foo', + 'bar']) + self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")], + ['', 'foo', 'bar']) + self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"), ['bar', + 'foo', '']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+", + "foo bar")], ['bar', 'foo', '']) + self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"), ['', 'foo', + 'bar']) + self.assertEqual([m[0] for m in regex.finditer(r"(?V1)^|\w+", + "foo bar")], ['', 'foo', 'bar']) + self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"), ['bar', + 'foo', '']) + self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+", + "foo bar")], ['bar', 'foo', '']) + + if sys.version_info >= (3, 7, 0): + self.assertEqual(regex.split("", "xaxbxc"), ['', 'x', 'a', 'x', + 'b', 'x', 'c', '']) + self.assertEqual([m for m in regex.splititer("", "xaxbxc")], ['', + 'x', 'a', 'x', 'b', 'x', 'c', '']) + else: + self.assertEqual(regex.split("", "xaxbxc"), ['xaxbxc']) + self.assertEqual([m for m in regex.splititer("", "xaxbxc")], + ['xaxbxc']) + + if sys.version_info >= (3, 7, 0): + self.assertEqual(regex.split("(?r)", "xaxbxc"), ['', 'c', 'x', 'b', + 'x', 'a', 'x', '']) + self.assertEqual([m for m in regex.splititer("(?r)", "xaxbxc")], + ['', 'c', 'x', 'b', 'x', 'a', 'x', '']) + else: + self.assertEqual(regex.split("(?r)", "xaxbxc"), ['xaxbxc']) + self.assertEqual([m for m in regex.splititer("(?r)", "xaxbxc")], + ['xaxbxc']) + + self.assertEqual(regex.split("(?V1)", "xaxbxc"), ['', 'x', 'a', 'x', + 'b', 'x', 'c', '']) + self.assertEqual([m for m in regex.splititer("(?V1)", "xaxbxc")], ['', + 'x', 'a', 'x', 'b', 'x', 'c', '']) + + self.assertEqual(regex.split("(?rV1)", "xaxbxc"), ['', 'c', 'x', 'b', + 'x', 'a', 'x', '']) + self.assertEqual([m for m in regex.splititer("(?rV1)", "xaxbxc")], ['', + 'c', 'x', 'b', 'x', 'a', 'x', '']) + + def test_scoped_and_inline_flags(self): + # Issues 433028, 433024, 433027. + self.assertEqual(regex.search(r"(?i)Ab", "ab").span(), (0, 2)) + self.assertEqual(regex.search(r"(?i:A)b", "ab").span(), (0, 2)) + # Changed to positional flags in regex 2023.12.23. + self.assertEqual(regex.search(r"A(?i)b", "ab"), None) + + self.assertEqual(regex.search(r"(?V0)Ab", "ab"), None) + self.assertEqual(regex.search(r"(?V1)Ab", "ab"), None) + self.assertEqual(regex.search(r"(?-i)Ab", "ab", flags=regex.I), None) + self.assertEqual(regex.search(r"(?-i:A)b", "ab", flags=regex.I), None) + self.assertEqual(regex.search(r"A(?-i)b", "ab", flags=regex.I).span(), + (0, 2)) + + def test_repeated_repeats(self): + # Issue 2537. + self.assertEqual(regex.search(r"(?:a+)+", "aaa").span(), (0, 3)) + self.assertEqual(regex.search(r"(?:(?:ab)+c)+", "abcabc").span(), (0, + 6)) + + # Hg issue 286. 
+ self.assertEqual(regex.search(r"(?:a+){2,}", "aaa").span(), (0, 3)) + + def test_lookbehind(self): + self.assertEqual(regex.search(r"123(?<=a\d+)", "a123").span(), (1, 4)) + self.assertEqual(regex.search(r"123(?<=a\d+)", "b123"), None) + self.assertEqual(regex.search(r"123(?= (3, 7, 0): + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "xy"), + 'y-x-') + else: + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "xy"), + 'y-x') + self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "xy"), 'y-x-') + if sys.version_info >= (3, 7, 0): + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "x"), '-x-') + else: + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "x"), '-x') + self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "x"), '-x-') + if sys.version_info >= (3, 7, 0): + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "y"), 'y--') + else: + self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "y"), 'y-') + self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "y"), 'y--') + + def test_bug_10328 (self): + # Issue 10328. + pat = regex.compile(r'(?mV0)(?P[ \t]+\r*$)|(?P(?<=[^\n])\Z)') + if sys.version_info >= (3, 7, 0): + self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>', + 'foobar '), ('foobar', 2)) + else: + self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>', + 'foobar '), ('foobar', 1)) + self.assertEqual([m.group() for m in pat.finditer('foobar ')], [' ', + '']) + pat = regex.compile(r'(?mV1)(?P[ \t]+\r*$)|(?P(?<=[^\n])\Z)') + self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>', + 'foobar '), ('foobar', 2)) + self.assertEqual([m.group() for m in pat.finditer('foobar ')], [' ', + '']) + + def test_overlapped(self): + self.assertEqual(regex.findall(r"..", "abcde"), ['ab', 'cd']) + self.assertEqual(regex.findall(r"..", "abcde", overlapped=True), ['ab', + 'bc', 'cd', 'de']) + self.assertEqual(regex.findall(r"(?r)..", "abcde"), ['de', 'bc']) + self.assertEqual(regex.findall(r"(?r)..", "abcde", overlapped=True), + ['de', 'cd', 'bc', 'ab']) + self.assertEqual(regex.findall(r"(.)(-)(.)", "a-b-c", overlapped=True), + [("a", "-", "b"), ("b", "-", "c")]) + + self.assertEqual([m[0] for m in regex.finditer(r"..", "abcde")], ['ab', + 'cd']) + self.assertEqual([m[0] for m in regex.finditer(r"..", "abcde", + overlapped=True)], ['ab', 'bc', 'cd', 'de']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde")], + ['de', 'bc']) + self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", + overlapped=True)], ['de', 'cd', 'bc', 'ab']) + + self.assertEqual([m.groups() for m in regex.finditer(r"(.)(-)(.)", + "a-b-c", overlapped=True)], [("a", "-", "b"), ("b", "-", "c")]) + self.assertEqual([m.groups() for m in regex.finditer(r"(?r)(.)(-)(.)", + "a-b-c", overlapped=True)], [("b", "-", "c"), ("a", "-", "b")]) + + def test_splititer(self): + self.assertEqual(regex.split(r",", "a,b,,c,"), ['a', 'b', '', 'c', '']) + self.assertEqual([m for m in regex.splititer(r",", "a,b,,c,")], ['a', + 'b', '', 'c', '']) + + def test_grapheme(self): + self.assertEqual(regex.match(r"\X", "\xE0").span(), (0, 1)) + self.assertEqual(regex.match(r"\X", "a\u0300").span(), (0, 2)) + + self.assertEqual(regex.findall(r"\X", + "a\xE0a\u0300e\xE9e\u0301"), ['a', '\xe0', 'a\u0300', 'e', + '\xe9', 'e\u0301']) + self.assertEqual(regex.findall(r"\X{3}", + "a\xE0a\u0300e\xE9e\u0301"), ['a\xe0a\u0300', 'e\xe9e\u0301']) + self.assertEqual(regex.findall(r"\X", "\r\r\n\u0301A\u0301"), + ['\r', '\r\n', '\u0301', 'A\u0301']) + + def test_word_boundary(self): + text = 
'The quick ("brown") fox can\'t jump 32.3 feet, right?' + self.assertEqual(regex.split(r'(?V1)\b', text), ['', 'The', ' ', + 'quick', ' ("', 'brown', '") ', 'fox', ' ', 'can', "'", 't', + ' ', 'jump', ' ', '32', '.', '3', ' ', 'feet', ', ', + 'right', '?']) + self.assertEqual(regex.split(r'(?V1w)\b', text), ['', 'The', ' ', + 'quick', ' ', '(', '"', 'brown', '"', ')', ' ', 'fox', ' ', + "can't", ' ', 'jump', ' ', '32.3', ' ', 'feet', ',', ' ', + 'right', '?', '']) + + text = "The fox" + self.assertEqual(regex.split(r'(?V1)\b', text), ['', 'The', ' ', + 'fox', '']) + self.assertEqual(regex.split(r'(?V1w)\b', text), ['', 'The', ' ', + 'fox', '']) + + text = "can't aujourd'hui l'objectif" + self.assertEqual(regex.split(r'(?V1)\b', text), ['', 'can', "'", + 't', ' ', 'aujourd', "'", 'hui', ' ', 'l', "'", 'objectif', + '']) + self.assertEqual(regex.split(r'(?V1w)\b', text), ['', "can't", ' ', + "aujourd'hui", ' ', "l'objectif", '']) + + def test_line_boundary(self): + self.assertEqual(regex.findall(r".+", "Line 1\nLine 2\n"), ["Line 1", + "Line 2"]) + self.assertEqual(regex.findall(r".+", "Line 1\rLine 2\r"), + ["Line 1\rLine 2\r"]) + self.assertEqual(regex.findall(r".+", "Line 1\r\nLine 2\r\n"), + ["Line 1\r", "Line 2\r"]) + self.assertEqual(regex.findall(r"(?w).+", "Line 1\nLine 2\n"), + ["Line 1", "Line 2"]) + self.assertEqual(regex.findall(r"(?w).+", "Line 1\rLine 2\r"), + ["Line 1", "Line 2"]) + self.assertEqual(regex.findall(r"(?w).+", "Line 1\r\nLine 2\r\n"), + ["Line 1", "Line 2"]) + + self.assertEqual(regex.search(r"^abc", "abc").start(), 0) + self.assertEqual(regex.search(r"^abc", "\nabc"), None) + self.assertEqual(regex.search(r"^abc", "\rabc"), None) + self.assertEqual(regex.search(r"(?w)^abc", "abc").start(), 0) + self.assertEqual(regex.search(r"(?w)^abc", "\nabc"), None) + self.assertEqual(regex.search(r"(?w)^abc", "\rabc"), None) + + self.assertEqual(regex.search(r"abc$", "abc").start(), 0) + self.assertEqual(regex.search(r"abc$", "abc\n").start(), 0) + self.assertEqual(regex.search(r"abc$", "abc\r"), None) + self.assertEqual(regex.search(r"(?w)abc$", "abc").start(), 0) + self.assertEqual(regex.search(r"(?w)abc$", "abc\n").start(), 0) + self.assertEqual(regex.search(r"(?w)abc$", "abc\r").start(), 0) + + self.assertEqual(regex.search(r"(?m)^abc", "abc").start(), 0) + self.assertEqual(regex.search(r"(?m)^abc", "\nabc").start(), 1) + self.assertEqual(regex.search(r"(?m)^abc", "\rabc"), None) + self.assertEqual(regex.search(r"(?mw)^abc", "abc").start(), 0) + self.assertEqual(regex.search(r"(?mw)^abc", "\nabc").start(), 1) + self.assertEqual(regex.search(r"(?mw)^abc", "\rabc").start(), 1) + + self.assertEqual(regex.search(r"(?m)abc$", "abc").start(), 0) + self.assertEqual(regex.search(r"(?m)abc$", "abc\n").start(), 0) + self.assertEqual(regex.search(r"(?m)abc$", "abc\r"), None) + self.assertEqual(regex.search(r"(?mw)abc$", "abc").start(), 0) + self.assertEqual(regex.search(r"(?mw)abc$", "abc\n").start(), 0) + self.assertEqual(regex.search(r"(?mw)abc$", "abc\r").start(), 0) + + def test_branch_reset(self): + self.assertEqual(regex.match(r"(?:(a)|(b))(c)", "ac").groups(), ('a', + None, 'c')) + self.assertEqual(regex.match(r"(?:(a)|(b))(c)", "bc").groups(), (None, + 'b', 'c')) + self.assertEqual(regex.match(r"(?:(?a)|(?b))(?c)", + "ac").groups(), ('a', None, 'c')) + self.assertEqual(regex.match(r"(?:(?a)|(?b))(?c)", + "bc").groups(), (None, 'b', 'c')) + + self.assertEqual(regex.match(r"(?a)(?:(?b)|(?c))(?d)", + "abd").groups(), ('a', 'b', None, 'd')) + 
self.assertEqual(regex.match(r"(?a)(?:(?b)|(?c))(?d)", + "acd").groups(), ('a', None, 'c', 'd')) + self.assertEqual(regex.match(r"(a)(?:(b)|(c))(d)", "abd").groups(), + ('a', 'b', None, 'd')) + + self.assertEqual(regex.match(r"(a)(?:(b)|(c))(d)", "acd").groups(), + ('a', None, 'c', 'd')) + self.assertEqual(regex.match(r"(a)(?|(b)|(b))(d)", "abd").groups(), + ('a', 'b', 'd')) + self.assertEqual(regex.match(r"(?|(?a)|(?b))(c)", "ac").groups(), + ('a', None, 'c')) + self.assertEqual(regex.match(r"(?|(?a)|(?b))(c)", "bc").groups(), + (None, 'b', 'c')) + self.assertEqual(regex.match(r"(?|(?a)|(?b))(c)", "ac").groups(), + ('a', 'c')) + + self.assertEqual(regex.match(r"(?|(?a)|(?b))(c)", "bc").groups(), + ('b', 'c')) + + self.assertEqual(regex.match(r"(?|(?a)(?b)|(?c)(?d))(e)", + "abe").groups(), ('a', 'b', 'e')) + self.assertEqual(regex.match(r"(?|(?a)(?b)|(?c)(?d))(e)", + "cde").groups(), ('d', 'c', 'e')) + self.assertEqual(regex.match(r"(?|(?a)(?b)|(?c)(d))(e)", + "abe").groups(), ('a', 'b', 'e')) + self.assertEqual(regex.match(r"(?|(?a)(?b)|(?c)(d))(e)", + "cde").groups(), ('d', 'c', 'e')) + self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(d))(e)", + "abe").groups(), ('a', 'b', 'e')) + self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(d))(e)", + "cde").groups(), ('c', 'd', 'e')) + + # Hg issue 87: Allow duplicate names of groups + self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(?d))(e)", + "abe").groups(), ("a", "b", "e")) + self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(?d))(e)", + "abe").capturesdict(), {"a": ["a"], "b": ["b"]}) + self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(?d))(e)", + "cde").groups(), ("d", None, "e")) + self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(?d))(e)", + "cde").capturesdict(), {"a": ["c", "d"], "b": []}) + + def test_set(self): + self.assertEqual(regex.match(r"[a]", "a").span(), (0, 1)) + self.assertEqual(regex.match(r"(?i)[a]", "A").span(), (0, 1)) + self.assertEqual(regex.match(r"[a-b]", r"a").span(), (0, 1)) + self.assertEqual(regex.match(r"(?i)[a-b]", r"A").span(), (0, 1)) + + self.assertEqual(regex.sub(r"(?V0)([][])", r"-", "a[b]c"), "a-b-c") + + self.assertEqual(regex.findall(r"[\p{Alpha}]", "a0"), ["a"]) + self.assertEqual(regex.findall(r"(?i)[\p{Alpha}]", "A0"), ["A"]) + + self.assertEqual(regex.findall(r"[a\p{Alpha}]", "ab0"), ["a", "b"]) + self.assertEqual(regex.findall(r"[a\P{Alpha}]", "ab0"), ["a", "0"]) + self.assertEqual(regex.findall(r"(?i)[a\p{Alpha}]", "ab0"), ["a", + "b"]) + self.assertEqual(regex.findall(r"(?i)[a\P{Alpha}]", "ab0"), ["a", + "0"]) + + self.assertEqual(regex.findall(r"[a-b\p{Alpha}]", "abC0"), ["a", + "b", "C"]) + self.assertEqual(regex.findall(r"(?i)[a-b\p{Alpha}]", "AbC0"), ["A", + "b", "C"]) + + self.assertEqual(regex.findall(r"[\p{Alpha}]", "a0"), ["a"]) + self.assertEqual(regex.findall(r"[\P{Alpha}]", "a0"), ["0"]) + self.assertEqual(regex.findall(r"[^\p{Alpha}]", "a0"), ["0"]) + self.assertEqual(regex.findall(r"[^\P{Alpha}]", "a0"), ["a"]) + + self.assertEqual("".join(regex.findall(r"[^\d-h]", "a^b12c-h")), + 'a^bc') + self.assertEqual("".join(regex.findall(r"[^\dh]", "a^b12c-h")), + 'a^bc-') + self.assertEqual("".join(regex.findall(r"[^h\s\db]", "a^b 12c-h")), + 'a^c-') + self.assertEqual("".join(regex.findall(r"[^b\w]", "a b")), ' ') + self.assertEqual("".join(regex.findall(r"[^b\S]", "a b")), ' ') + self.assertEqual("".join(regex.findall(r"[^8\d]", "a 1b2")), 'a b') + + all_chars = "".join(chr(c) for c in range(0x100)) + self.assertEqual(len(regex.findall(r"\p{ASCII}", all_chars)), 128) + 
self.assertEqual(len(regex.findall(r"\p{Letter}", all_chars)), + 117) + self.assertEqual(len(regex.findall(r"\p{Digit}", all_chars)), 10) + + # Set operators + self.assertEqual(len(regex.findall(r"(?V1)[\p{ASCII}&&\p{Letter}]", + all_chars)), 52) + self.assertEqual(len(regex.findall(r"(?V1)[\p{ASCII}&&\p{Alnum}&&\p{Letter}]", + all_chars)), 52) + self.assertEqual(len(regex.findall(r"(?V1)[\p{ASCII}&&\p{Alnum}&&\p{Digit}]", + all_chars)), 10) + self.assertEqual(len(regex.findall(r"(?V1)[\p{ASCII}&&\p{Cc}]", + all_chars)), 33) + self.assertEqual(len(regex.findall(r"(?V1)[\p{ASCII}&&\p{Graph}]", + all_chars)), 94) + self.assertEqual(len(regex.findall(r"(?V1)[\p{ASCII}--\p{Cc}]", + all_chars)), 95) + self.assertEqual(len(regex.findall(r"[\p{Letter}\p{Digit}]", + all_chars)), 127) + self.assertEqual(len(regex.findall(r"(?V1)[\p{Letter}||\p{Digit}]", + all_chars)), 127) + self.assertEqual(len(regex.findall(r"\p{HexDigit}", all_chars)), + 22) + self.assertEqual(len(regex.findall(r"(?V1)[\p{HexDigit}~~\p{Digit}]", + all_chars)), 12) + self.assertEqual(len(regex.findall(r"(?V1)[\p{Digit}~~\p{HexDigit}]", + all_chars)), 12) + + self.assertEqual(repr(type(regex.compile(r"(?V0)([][-])"))), + self.PATTERN_CLASS) + self.assertEqual(regex.findall(r"(?V1)[[a-z]--[aei]]", "abc"), ["b", + "c"]) + self.assertEqual(regex.findall(r"(?iV1)[[a-z]--[aei]]", "abc"), ["b", + "c"]) + self.assertEqual(regex.findall(r"(?V1)[\w--a]","abc"), ["b", "c"]) + self.assertEqual(regex.findall(r"(?iV1)[\w--a]","abc"), ["b", "c"]) + + def test_various(self): + tests = [ + # Test ?P< and ?P= extensions. + ('(?Pa)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with a digit. + ('(?Pa)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with an illegal char. + ('(?Pa)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with an illegal char. + + # Same tests, for the ?P= form. + ('(?Pa)(?P=foo_123', 'aa', '', regex.error, + self.MISSING_RPAREN), + ('(?Pa)(?P=1)', 'aa', '1', ascii('a')), + ('(?Pa)(?P=0)', 'aa', '', regex.error, + self.BAD_GROUP_NAME), + ('(?Pa)(?P=-1)', 'aa', '', regex.error, + self.BAD_GROUP_NAME), + ('(?Pa)(?P=!)', 'aa', '', regex.error, + self.BAD_GROUP_NAME), + ('(?Pa)(?P=foo_124)', 'aa', '', regex.error, + self.UNKNOWN_GROUP), # Backref to undefined group. + + ('(?Pa)', 'a', '1', ascii('a')), + ('(?Pa)(?P=foo_123)', 'aa', '1', ascii('a')), + + # Mal-formed \g in pattern treated as literal for compatibility. + (r'(?a)\ga)\g<1>', 'aa', '1', ascii('a')), + (r'(?a)\g', 'aa', '', ascii(None)), + (r'(?a)\g', 'aa', '', regex.error, + self.UNKNOWN_GROUP), # Backref to undefined group. + + ('(?a)', 'a', '1', ascii('a')), + (r'(?a)\g', 'aa', '1', ascii('a')), + + # Test octal escapes. + ('\\1', 'a', '', regex.error, self.INVALID_GROUP_REF), # Backreference. + ('[\\1]', '\1', '0', "'\\x01'"), # Character. + ('\\09', chr(0) + '9', '0', ascii(chr(0) + '9')), + ('\\141', 'a', '0', ascii('a')), + ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', + '0,11', ascii(('abcdefghijklk9', 'k'))), + + # Test \0 is handled everywhere. + (r'\0', '\0', '0', ascii('\0')), + (r'[\0a]', '\0', '0', ascii('\0')), + (r'[a\0]', '\0', '0', ascii('\0')), + (r'[^a\0]', '\0', '', ascii(None)), + + # Test various letter escapes. + (r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', '0', + ascii('\a\b\f\n\r\t\v')), + (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', '0', + ascii('\a\b\f\n\r\t\v')), + (r'\xff', '\377', '0', ascii(chr(255))), + + # New \x semantics. 
+ (r'\x00ffffffffffffff', '\377', '', ascii(None)), + (r'\x00f', '\017', '', ascii(None)), + (r'\x00fe', '\376', '', ascii(None)), + + (r'\x00ff', '\377', '', ascii(None)), + (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', '0', ascii('\t\n\v\r\f\ag')), + ('\t\n\v\r\f\a\\g', '\t\n\v\r\f\ag', '0', ascii('\t\n\v\r\f\ag')), + (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', '0', ascii(chr(9) + chr(10) + + chr(11) + chr(13) + chr(12) + chr(7))), + (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', '0', + ascii('\t\n\v\r\f\b')), + + (r"^\w+=(\\[\000-\277]|[^\n\\])*", + "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", '0', + ascii("SRC=eval.c g.c blah blah blah \\\\")), + + # Test that . only matches \n in DOTALL mode. + ('a.b', 'acb', '0', ascii('acb')), + ('a.b', 'a\nb', '', ascii(None)), + ('a.*b', 'acc\nccb', '', ascii(None)), + ('a.{4,5}b', 'acc\nccb', '', ascii(None)), + ('a.b', 'a\rb', '0', ascii('a\rb')), + # Changed to positional flags in regex 2023.12.23. + ('a.b(?s)', 'a\nb', '', ascii(None)), + ('(?s)a.b', 'a\nb', '0', ascii('a\nb')), + ('a.*(?s)b', 'acc\nccb', '', ascii(None)), + ('(?s)a.*b', 'acc\nccb', '0', ascii('acc\nccb')), + ('(?s)a.{4,5}b', 'acc\nccb', '0', ascii('acc\nccb')), + + (')', '', '', regex.error, self.TRAILING_CHARS), # Unmatched right bracket. + ('', '', '0', "''"), # Empty pattern. + ('abc', 'abc', '0', ascii('abc')), + ('abc', 'xbc', '', ascii(None)), + ('abc', 'axc', '', ascii(None)), + ('abc', 'abx', '', ascii(None)), + ('abc', 'xabcy', '0', ascii('abc')), + ('abc', 'ababc', '0', ascii('abc')), + ('ab*c', 'abc', '0', ascii('abc')), + ('ab*bc', 'abc', '0', ascii('abc')), + + ('ab*bc', 'abbc', '0', ascii('abbc')), + ('ab*bc', 'abbbbc', '0', ascii('abbbbc')), + ('ab+bc', 'abbc', '0', ascii('abbc')), + ('ab+bc', 'abc', '', ascii(None)), + ('ab+bc', 'abq', '', ascii(None)), + ('ab+bc', 'abbbbc', '0', ascii('abbbbc')), + ('ab?bc', 'abbc', '0', ascii('abbc')), + ('ab?bc', 'abc', '0', ascii('abc')), + ('ab?bc', 'abbbbc', '', ascii(None)), + ('ab?c', 'abc', '0', ascii('abc')), + + ('^abc$', 'abc', '0', ascii('abc')), + ('^abc$', 'abcc', '', ascii(None)), + ('^abc', 'abcc', '0', ascii('abc')), + ('^abc$', 'aabc', '', ascii(None)), + ('abc$', 'aabc', '0', ascii('abc')), + ('^', 'abc', '0', ascii('')), + ('$', 'abc', '0', ascii('')), + ('a.c', 'abc', '0', ascii('abc')), + ('a.c', 'axc', '0', ascii('axc')), + ('a.*c', 'axyzc', '0', ascii('axyzc')), + + ('a.*c', 'axyzd', '', ascii(None)), + ('a[bc]d', 'abc', '', ascii(None)), + ('a[bc]d', 'abd', '0', ascii('abd')), + ('a[b-d]e', 'abd', '', ascii(None)), + ('a[b-d]e', 'ace', '0', ascii('ace')), + ('a[b-d]', 'aac', '0', ascii('ac')), + ('a[-b]', 'a-', '0', ascii('a-')), + ('a[\\-b]', 'a-', '0', ascii('a-')), + ('a[b-]', 'a-', '0', ascii('a-')), + ('a[]b', '-', '', regex.error, self.BAD_SET), + + ('a[', '-', '', regex.error, self.BAD_SET), + ('a\\', '-', '', regex.error, self.BAD_ESCAPE), + ('abc)', '-', '', regex.error, self.TRAILING_CHARS), + ('(abc', '-', '', regex.error, self.MISSING_RPAREN), + ('a]', 'a]', '0', ascii('a]')), + ('a[]]b', 'a]b', '0', ascii('a]b')), + ('a[]]b', 'a]b', '0', ascii('a]b')), + ('a[^bc]d', 'aed', '0', ascii('aed')), + ('a[^bc]d', 'abd', '', ascii(None)), + ('a[^-b]c', 'adc', '0', ascii('adc')), + + ('a[^-b]c', 'a-c', '', ascii(None)), + ('a[^]b]c', 'a]c', '', ascii(None)), + ('a[^]b]c', 'adc', '0', ascii('adc')), + ('\\ba\\b', 'a-', '0', ascii('a')), + ('\\ba\\b', '-a', '0', ascii('a')), + ('\\ba\\b', '-a-', '0', ascii('a')), + ('\\by\\b', 'xy', '', ascii(None)), + ('\\by\\b', 'yz', '', ascii(None)), + ('\\by\\b', 'xyz', '', 
ascii(None)), + ('x\\b', 'xyz', '', ascii(None)), + + ('x\\B', 'xyz', '0', ascii('x')), + ('\\Bz', 'xyz', '0', ascii('z')), + ('z\\B', 'xyz', '', ascii(None)), + ('\\Bx', 'xyz', '', ascii(None)), + ('\\Ba\\B', 'a-', '', ascii(None)), + ('\\Ba\\B', '-a', '', ascii(None)), + ('\\Ba\\B', '-a-', '', ascii(None)), + ('\\By\\B', 'xy', '', ascii(None)), + ('\\By\\B', 'yz', '', ascii(None)), + ('\\By\\b', 'xy', '0', ascii('y')), + + ('\\by\\B', 'yz', '0', ascii('y')), + ('\\By\\B', 'xyz', '0', ascii('y')), + ('ab|cd', 'abc', '0', ascii('ab')), + ('ab|cd', 'abcd', '0', ascii('ab')), + ('()ef', 'def', '0,1', ascii(('ef', ''))), + ('$b', 'b', '', ascii(None)), + ('a\\(b', 'a(b', '', ascii(('a(b',))), + ('a\\(*b', 'ab', '0', ascii('ab')), + ('a\\(*b', 'a((b', '0', ascii('a((b')), + ('a\\\\b', 'a\\b', '0', ascii('a\\b')), + + ('((a))', 'abc', '0,1,2', ascii(('a', 'a', 'a'))), + ('(a)b(c)', 'abc', '0,1,2', ascii(('abc', 'a', 'c'))), + ('a+b+c', 'aabbabc', '0', ascii('abc')), + ('(a+|b)*', 'ab', '0,1', ascii(('ab', 'b'))), + ('(a+|b)+', 'ab', '0,1', ascii(('ab', 'b'))), + ('(a+|b)?', 'ab', '0,1', ascii(('a', 'a'))), + (')(', '-', '', regex.error, self.TRAILING_CHARS), + ('[^ab]*', 'cde', '0', ascii('cde')), + ('abc', '', '', ascii(None)), + ('a*', '', '0', ascii('')), + + ('a|b|c|d|e', 'e', '0', ascii('e')), + ('(a|b|c|d|e)f', 'ef', '0,1', ascii(('ef', 'e'))), + ('abcd*efg', 'abcdefg', '0', ascii('abcdefg')), + ('ab*', 'xabyabbbz', '0', ascii('ab')), + ('ab*', 'xayabbbz', '0', ascii('a')), + ('(ab|cd)e', 'abcde', '0,1', ascii(('cde', 'cd'))), + ('[abhgefdc]ij', 'hij', '0', ascii('hij')), + ('^(ab|cd)e', 'abcde', '', ascii(None)), + ('(abc|)ef', 'abcdef', '0,1', ascii(('ef', ''))), + ('(a|b)c*d', 'abcd', '0,1', ascii(('bcd', 'b'))), + + ('(ab|ab*)bc', 'abc', '0,1', ascii(('abc', 'a'))), + ('a([bc]*)c*', 'abc', '0,1', ascii(('abc', 'bc'))), + ('a([bc]*)(c*d)', 'abcd', '0,1,2', ascii(('abcd', 'bc', 'd'))), + ('a([bc]+)(c*d)', 'abcd', '0,1,2', ascii(('abcd', 'bc', 'd'))), + ('a([bc]*)(c+d)', 'abcd', '0,1,2', ascii(('abcd', 'b', 'cd'))), + ('a[bcd]*dcdcde', 'adcdcde', '0', ascii('adcdcde')), + ('a[bcd]+dcdcde', 'adcdcde', '', ascii(None)), + ('(ab|a)b*c', 'abc', '0,1', ascii(('abc', 'ab'))), + ('((a)(b)c)(d)', 'abcd', '1,2,3,4', ascii(('abc', 'a', 'b', 'd'))), + ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', '0', ascii('alpha')), + + ('^a(bc+|b[eh])g|.h$', 'abh', '0,1', ascii(('bh', None))), + ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', '0,1,2', ascii(('effgz', + 'effgz', None))), + ('(bc+d$|ef*g.|h?i(j|k))', 'ij', '0,1,2', ascii(('ij', 'ij', + 'j'))), + ('(bc+d$|ef*g.|h?i(j|k))', 'effg', '', ascii(None)), + ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', '', ascii(None)), + ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', '0,1,2', ascii(('effgz', + 'effgz', None))), + ('(((((((((a)))))))))', 'a', '0', ascii('a')), + ('multiple words of text', 'uh-uh', '', ascii(None)), + ('multiple words', 'multiple words, yeah', '0', + ascii('multiple words')), + ('(.*)c(.*)', 'abcde', '0,1,2', ascii(('abcde', 'ab', 'de'))), + + ('\\((.*), (.*)\\)', '(a, b)', '2,1', ascii(('b', 'a'))), + ('[k]', 'ab', '', ascii(None)), + ('a[-]?c', 'ac', '0', ascii('ac')), + ('(abc)\\1', 'abcabc', '1', ascii('abc')), + ('([a-c]*)\\1', 'abcabc', '1', ascii('abc')), + ('^(.+)?B', 'AB', '1', ascii('A')), + ('(a+).\\1$', 'aaaaa', '0,1', ascii(('aaaaa', 'aa'))), + ('^(a+).\\1$', 'aaaa', '', ascii(None)), + ('(abc)\\1', 'abcabc', '0,1', ascii(('abcabc', 'abc'))), + ('([a-c]+)\\1', 'abcabc', '0,1', ascii(('abcabc', 'abc'))), + + ('(a)\\1', 'aa', '0,1', ascii(('aa', 'a'))), + ('(a+)\\1', 
+            'aa', '0,1', ascii(('aa', 'a'))),
+            ('(a+)+\\1', 'aa', '0,1', ascii(('aa', 'a'))),
+            ('(a).+\\1', 'aba', '0,1', ascii(('aba', 'a'))),
+            ('(a)ba*\\1', 'aba', '0,1', ascii(('aba', 'a'))),
+            ('(aa|a)a\\1$', 'aaa', '0,1', ascii(('aaa', 'a'))),
+            ('(a|aa)a\\1$', 'aaa', '0,1', ascii(('aaa', 'a'))),
+            ('(a+)a\\1$', 'aaa', '0,1', ascii(('aaa', 'a'))),
+            ('([abc]*)\\1', 'abcabc', '0,1', ascii(('abcabc', 'abc'))),
+            ('(a)(b)c|ab', 'ab', '0,1,2', ascii(('ab', None, None))),
+
+            ('(a)+x', 'aaax', '0,1', ascii(('aaax', 'a'))),
+            ('([ac])+x', 'aacx', '0,1', ascii(('aacx', 'c'))),
+            ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', '0,1', ascii(('d:msgs/tdir/sub1/', 'tdir/'))),
+            ('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', '0,1,2,3', ascii(('track1.title:TBlah blah blah', 'track1', 'title', 'Blah blah blah'))),
+            ('([^N]*N)+', 'abNNxyzN', '0,1', ascii(('abNNxyzN', 'xyzN'))),
+            ('([^N]*N)+', 'abNNxyz', '0,1', ascii(('abNN', 'N'))),
+            ('([abc]*)x', 'abcx', '0,1', ascii(('abcx', 'abc'))),
+            ('([abc]*)x', 'abc', '', ascii(None)),
+            ('([xyz]*)x', 'abcx', '0,1', ascii(('x', ''))),
+            ('(a)+b|aac', 'aac', '0,1', ascii(('aac', None))),
+
+            # Test symbolic groups.
+            ('(?P<i d>aaa)a', 'aaaa', '', regex.error, self.BAD_GROUP_NAME),
+            ('(?P<id>aaa)a', 'aaaa', '0,id', ascii(('aaaa', 'aaa'))),
+            ('(?P<id>aa)(?P=id)', 'aaaa', '0,id', ascii(('aaaa', 'aa'))),
+            ('(?P<id>aa)(?P=xd)', 'aaaa', '', regex.error, self.UNKNOWN_GROUP),
+
+            # Character properties.
+            (r"\g", "g", '0', ascii('g')),
+            (r"\g<1>", "g", '', regex.error, self.INVALID_GROUP_REF),
+            (r"(.)\g<1>", "gg", '0', ascii('gg')),
+            (r"(.)\g<1>", "gg", '', ascii(('gg', 'g'))),
+            (r"\N", "N", '0', ascii('N')),
+            (r"\N{LATIN SMALL LETTER A}", "a", '0', ascii('a')),
+            (r"\p", "p", '0', ascii('p')),
+            (r"\p{Ll}", "a", '0', ascii('a')),
+            (r"\P", "P", '0', ascii('P')),
+            (r"\P{Lu}", "p", '0', ascii('p')),
+
+            # All tests from Perl.
+ ('abc', 'abc', '0', ascii('abc')), + ('abc', 'xbc', '', ascii(None)), + ('abc', 'axc', '', ascii(None)), + ('abc', 'abx', '', ascii(None)), + ('abc', 'xabcy', '0', ascii('abc')), + ('abc', 'ababc', '0', ascii('abc')), + + ('ab*c', 'abc', '0', ascii('abc')), + ('ab*bc', 'abc', '0', ascii('abc')), + ('ab*bc', 'abbc', '0', ascii('abbc')), + ('ab*bc', 'abbbbc', '0', ascii('abbbbc')), + ('ab{0,}bc', 'abbbbc', '0', ascii('abbbbc')), + ('ab+bc', 'abbc', '0', ascii('abbc')), + ('ab+bc', 'abc', '', ascii(None)), + ('ab+bc', 'abq', '', ascii(None)), + ('ab{1,}bc', 'abq', '', ascii(None)), + ('ab+bc', 'abbbbc', '0', ascii('abbbbc')), + + ('ab{1,}bc', 'abbbbc', '0', ascii('abbbbc')), + ('ab{1,3}bc', 'abbbbc', '0', ascii('abbbbc')), + ('ab{3,4}bc', 'abbbbc', '0', ascii('abbbbc')), + ('ab{4,5}bc', 'abbbbc', '', ascii(None)), + ('ab?bc', 'abbc', '0', ascii('abbc')), + ('ab?bc', 'abc', '0', ascii('abc')), + ('ab{0,1}bc', 'abc', '0', ascii('abc')), + ('ab?bc', 'abbbbc', '', ascii(None)), + ('ab?c', 'abc', '0', ascii('abc')), + ('ab{0,1}c', 'abc', '0', ascii('abc')), + + ('^abc$', 'abc', '0', ascii('abc')), + ('^abc$', 'abcc', '', ascii(None)), + ('^abc', 'abcc', '0', ascii('abc')), + ('^abc$', 'aabc', '', ascii(None)), + ('abc$', 'aabc', '0', ascii('abc')), + ('^', 'abc', '0', ascii('')), + ('$', 'abc', '0', ascii('')), + ('a.c', 'abc', '0', ascii('abc')), + ('a.c', 'axc', '0', ascii('axc')), + ('a.*c', 'axyzc', '0', ascii('axyzc')), + + ('a.*c', 'axyzd', '', ascii(None)), + ('a[bc]d', 'abc', '', ascii(None)), + ('a[bc]d', 'abd', '0', ascii('abd')), + ('a[b-d]e', 'abd', '', ascii(None)), + ('a[b-d]e', 'ace', '0', ascii('ace')), + ('a[b-d]', 'aac', '0', ascii('ac')), + ('a[-b]', 'a-', '0', ascii('a-')), + ('a[b-]', 'a-', '0', ascii('a-')), + ('a[b-a]', '-', '', regex.error, self.BAD_CHAR_RANGE), + ('a[]b', '-', '', regex.error, self.BAD_SET), + + ('a[', '-', '', regex.error, self.BAD_SET), + ('a]', 'a]', '0', ascii('a]')), + ('a[]]b', 'a]b', '0', ascii('a]b')), + ('a[^bc]d', 'aed', '0', ascii('aed')), + ('a[^bc]d', 'abd', '', ascii(None)), + ('a[^-b]c', 'adc', '0', ascii('adc')), + ('a[^-b]c', 'a-c', '', ascii(None)), + ('a[^]b]c', 'a]c', '', ascii(None)), + ('a[^]b]c', 'adc', '0', ascii('adc')), + ('ab|cd', 'abc', '0', ascii('ab')), + + ('ab|cd', 'abcd', '0', ascii('ab')), + ('()ef', 'def', '0,1', ascii(('ef', ''))), + ('*a', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('(*)b', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('$b', 'b', '', ascii(None)), + ('a\\', '-', '', regex.error, self.BAD_ESCAPE), + ('a\\(b', 'a(b', '', ascii(('a(b',))), + ('a\\(*b', 'ab', '0', ascii('ab')), + ('a\\(*b', 'a((b', '0', ascii('a((b')), + ('a\\\\b', 'a\\b', '0', ascii('a\\b')), + + ('abc)', '-', '', regex.error, self.TRAILING_CHARS), + ('(abc', '-', '', regex.error, self.MISSING_RPAREN), + ('((a))', 'abc', '0,1,2', ascii(('a', 'a', 'a'))), + ('(a)b(c)', 'abc', '0,1,2', ascii(('abc', 'a', 'c'))), + ('a+b+c', 'aabbabc', '0', ascii('abc')), + ('a{1,}b{1,}c', 'aabbabc', '0', ascii('abc')), + ('a**', '-', '', regex.error, self.MULTIPLE_REPEAT), + ('a.+?c', 'abcabc', '0', ascii('abc')), + ('(a+|b)*', 'ab', '0,1', ascii(('ab', 'b'))), + ('(a+|b){0,}', 'ab', '0,1', ascii(('ab', 'b'))), + + ('(a+|b)+', 'ab', '0,1', ascii(('ab', 'b'))), + ('(a+|b){1,}', 'ab', '0,1', ascii(('ab', 'b'))), + ('(a+|b)?', 'ab', '0,1', ascii(('a', 'a'))), + ('(a+|b){0,1}', 'ab', '0,1', ascii(('a', 'a'))), + (')(', '-', '', regex.error, self.TRAILING_CHARS), + ('[^ab]*', 'cde', '0', ascii('cde')), + ('abc', '', '', ascii(None)), + ('a*', '', '0', 
ascii('')), + ('([abc])*d', 'abbbcd', '0,1', ascii(('abbbcd', 'c'))), + ('([abc])*bcd', 'abcd', '0,1', ascii(('abcd', 'a'))), + + ('a|b|c|d|e', 'e', '0', ascii('e')), + ('(a|b|c|d|e)f', 'ef', '0,1', ascii(('ef', 'e'))), + ('abcd*efg', 'abcdefg', '0', ascii('abcdefg')), + ('ab*', 'xabyabbbz', '0', ascii('ab')), + ('ab*', 'xayabbbz', '0', ascii('a')), + ('(ab|cd)e', 'abcde', '0,1', ascii(('cde', 'cd'))), + ('[abhgefdc]ij', 'hij', '0', ascii('hij')), + ('^(ab|cd)e', 'abcde', '', ascii(None)), + ('(abc|)ef', 'abcdef', '0,1', ascii(('ef', ''))), + ('(a|b)c*d', 'abcd', '0,1', ascii(('bcd', 'b'))), + + ('(ab|ab*)bc', 'abc', '0,1', ascii(('abc', 'a'))), + ('a([bc]*)c*', 'abc', '0,1', ascii(('abc', 'bc'))), + ('a([bc]*)(c*d)', 'abcd', '0,1,2', ascii(('abcd', 'bc', 'd'))), + ('a([bc]+)(c*d)', 'abcd', '0,1,2', ascii(('abcd', 'bc', 'd'))), + ('a([bc]*)(c+d)', 'abcd', '0,1,2', ascii(('abcd', 'b', 'cd'))), + ('a[bcd]*dcdcde', 'adcdcde', '0', ascii('adcdcde')), + ('a[bcd]+dcdcde', 'adcdcde', '', ascii(None)), + ('(ab|a)b*c', 'abc', '0,1', ascii(('abc', 'ab'))), + ('((a)(b)c)(d)', 'abcd', '1,2,3,4', ascii(('abc', 'a', 'b', 'd'))), + ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', '0', ascii('alpha')), + + ('^a(bc+|b[eh])g|.h$', 'abh', '0,1', ascii(('bh', None))), + ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', '0,1,2', ascii(('effgz', + 'effgz', None))), + ('(bc+d$|ef*g.|h?i(j|k))', 'ij', '0,1,2', ascii(('ij', 'ij', + 'j'))), + ('(bc+d$|ef*g.|h?i(j|k))', 'effg', '', ascii(None)), + ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', '', ascii(None)), + ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', '0,1,2', ascii(('effgz', + 'effgz', None))), + ('((((((((((a))))))))))', 'a', '10', ascii('a')), + ('((((((((((a))))))))))\\10', 'aa', '0', ascii('aa')), + + # Python does not have the same rules for \\41 so this is a syntax error + # ('((((((((((a))))))))))\\41', 'aa', '', ascii(None)), + # ('((((((((((a))))))))))\\41', 'a!', '0', ascii('a!')), + ('((((((((((a))))))))))\\41', '', '', regex.error, + self.INVALID_GROUP_REF), + ('(?i)((((((((((a))))))))))\\41', '', '', regex.error, + self.INVALID_GROUP_REF), + + ('(((((((((a)))))))))', 'a', '0', ascii('a')), + ('multiple words of text', 'uh-uh', '', ascii(None)), + ('multiple words', 'multiple words, yeah', '0', + ascii('multiple words')), + ('(.*)c(.*)', 'abcde', '0,1,2', ascii(('abcde', 'ab', 'de'))), + ('\\((.*), (.*)\\)', '(a, b)', '2,1', ascii(('b', 'a'))), + ('[k]', 'ab', '', ascii(None)), + ('a[-]?c', 'ac', '0', ascii('ac')), + ('(abc)\\1', 'abcabc', '1', ascii('abc')), + ('([a-c]*)\\1', 'abcabc', '1', ascii('abc')), + ('(?i)abc', 'ABC', '0', ascii('ABC')), + + ('(?i)abc', 'XBC', '', ascii(None)), + ('(?i)abc', 'AXC', '', ascii(None)), + ('(?i)abc', 'ABX', '', ascii(None)), + ('(?i)abc', 'XABCY', '0', ascii('ABC')), + ('(?i)abc', 'ABABC', '0', ascii('ABC')), + ('(?i)ab*c', 'ABC', '0', ascii('ABC')), + ('(?i)ab*bc', 'ABC', '0', ascii('ABC')), + ('(?i)ab*bc', 'ABBC', '0', ascii('ABBC')), + ('(?i)ab*?bc', 'ABBBBC', '0', ascii('ABBBBC')), + ('(?i)ab{0,}?bc', 'ABBBBC', '0', ascii('ABBBBC')), + + ('(?i)ab+?bc', 'ABBC', '0', ascii('ABBC')), + ('(?i)ab+bc', 'ABC', '', ascii(None)), + ('(?i)ab+bc', 'ABQ', '', ascii(None)), + ('(?i)ab{1,}bc', 'ABQ', '', ascii(None)), + ('(?i)ab+bc', 'ABBBBC', '0', ascii('ABBBBC')), + ('(?i)ab{1,}?bc', 'ABBBBC', '0', ascii('ABBBBC')), + ('(?i)ab{1,3}?bc', 'ABBBBC', '0', ascii('ABBBBC')), + ('(?i)ab{3,4}?bc', 'ABBBBC', '0', ascii('ABBBBC')), + ('(?i)ab{4,5}?bc', 'ABBBBC', '', ascii(None)), + ('(?i)ab??bc', 'ABBC', '0', ascii('ABBC')), + + ('(?i)ab??bc', 'ABC', '0', ascii('ABC')), + 
('(?i)ab{0,1}?bc', 'ABC', '0', ascii('ABC')), + ('(?i)ab??bc', 'ABBBBC', '', ascii(None)), + ('(?i)ab??c', 'ABC', '0', ascii('ABC')), + ('(?i)ab{0,1}?c', 'ABC', '0', ascii('ABC')), + ('(?i)^abc$', 'ABC', '0', ascii('ABC')), + ('(?i)^abc$', 'ABCC', '', ascii(None)), + ('(?i)^abc', 'ABCC', '0', ascii('ABC')), + ('(?i)^abc$', 'AABC', '', ascii(None)), + ('(?i)abc$', 'AABC', '0', ascii('ABC')), + + ('(?i)^', 'ABC', '0', ascii('')), + ('(?i)$', 'ABC', '0', ascii('')), + ('(?i)a.c', 'ABC', '0', ascii('ABC')), + ('(?i)a.c', 'AXC', '0', ascii('AXC')), + ('(?i)a.*?c', 'AXYZC', '0', ascii('AXYZC')), + ('(?i)a.*c', 'AXYZD', '', ascii(None)), + ('(?i)a[bc]d', 'ABC', '', ascii(None)), + ('(?i)a[bc]d', 'ABD', '0', ascii('ABD')), + ('(?i)a[b-d]e', 'ABD', '', ascii(None)), + ('(?i)a[b-d]e', 'ACE', '0', ascii('ACE')), + + ('(?i)a[b-d]', 'AAC', '0', ascii('AC')), + ('(?i)a[-b]', 'A-', '0', ascii('A-')), + ('(?i)a[b-]', 'A-', '0', ascii('A-')), + ('(?i)a[b-a]', '-', '', regex.error, self.BAD_CHAR_RANGE), + ('(?i)a[]b', '-', '', regex.error, self.BAD_SET), + ('(?i)a[', '-', '', regex.error, self.BAD_SET), + ('(?i)a]', 'A]', '0', ascii('A]')), + ('(?i)a[]]b', 'A]B', '0', ascii('A]B')), + ('(?i)a[^bc]d', 'AED', '0', ascii('AED')), + ('(?i)a[^bc]d', 'ABD', '', ascii(None)), + + ('(?i)a[^-b]c', 'ADC', '0', ascii('ADC')), + ('(?i)a[^-b]c', 'A-C', '', ascii(None)), + ('(?i)a[^]b]c', 'A]C', '', ascii(None)), + ('(?i)a[^]b]c', 'ADC', '0', ascii('ADC')), + ('(?i)ab|cd', 'ABC', '0', ascii('AB')), + ('(?i)ab|cd', 'ABCD', '0', ascii('AB')), + ('(?i)()ef', 'DEF', '0,1', ascii(('EF', ''))), + ('(?i)*a', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('(?i)(*)b', '-', '', regex.error, self.NOTHING_TO_REPEAT), + ('(?i)$b', 'B', '', ascii(None)), + + ('(?i)a\\', '-', '', regex.error, self.BAD_ESCAPE), + ('(?i)a\\(b', 'A(B', '', ascii(('A(B',))), + ('(?i)a\\(*b', 'AB', '0', ascii('AB')), + ('(?i)a\\(*b', 'A((B', '0', ascii('A((B')), + ('(?i)a\\\\b', 'A\\B', '0', ascii('A\\B')), + ('(?i)abc)', '-', '', regex.error, self.TRAILING_CHARS), + ('(?i)(abc', '-', '', regex.error, self.MISSING_RPAREN), + ('(?i)((a))', 'ABC', '0,1,2', ascii(('A', 'A', 'A'))), + ('(?i)(a)b(c)', 'ABC', '0,1,2', ascii(('ABC', 'A', 'C'))), + ('(?i)a+b+c', 'AABBABC', '0', ascii('ABC')), + + ('(?i)a{1,}b{1,}c', 'AABBABC', '0', ascii('ABC')), + ('(?i)a**', '-', '', regex.error, self.MULTIPLE_REPEAT), + ('(?i)a.+?c', 'ABCABC', '0', ascii('ABC')), + ('(?i)a.*?c', 'ABCABC', '0', ascii('ABC')), + ('(?i)a.{0,5}?c', 'ABCABC', '0', ascii('ABC')), + ('(?i)(a+|b)*', 'AB', '0,1', ascii(('AB', 'B'))), + ('(?i)(a+|b){0,}', 'AB', '0,1', ascii(('AB', 'B'))), + ('(?i)(a+|b)+', 'AB', '0,1', ascii(('AB', 'B'))), + ('(?i)(a+|b){1,}', 'AB', '0,1', ascii(('AB', 'B'))), + ('(?i)(a+|b)?', 'AB', '0,1', ascii(('A', 'A'))), + + ('(?i)(a+|b){0,1}', 'AB', '0,1', ascii(('A', 'A'))), + ('(?i)(a+|b){0,1}?', 'AB', '0,1', ascii(('', None))), + ('(?i))(', '-', '', regex.error, self.TRAILING_CHARS), + ('(?i)[^ab]*', 'CDE', '0', ascii('CDE')), + ('(?i)abc', '', '', ascii(None)), + ('(?i)a*', '', '0', ascii('')), + ('(?i)([abc])*d', 'ABBBCD', '0,1', ascii(('ABBBCD', 'C'))), + ('(?i)([abc])*bcd', 'ABCD', '0,1', ascii(('ABCD', 'A'))), + ('(?i)a|b|c|d|e', 'E', '0', ascii('E')), + ('(?i)(a|b|c|d|e)f', 'EF', '0,1', ascii(('EF', 'E'))), + + ('(?i)abcd*efg', 'ABCDEFG', '0', ascii('ABCDEFG')), + ('(?i)ab*', 'XABYABBBZ', '0', ascii('AB')), + ('(?i)ab*', 'XAYABBBZ', '0', ascii('A')), + ('(?i)(ab|cd)e', 'ABCDE', '0,1', ascii(('CDE', 'CD'))), + ('(?i)[abhgefdc]ij', 'HIJ', '0', ascii('HIJ')), + 
('(?i)^(ab|cd)e', 'ABCDE', '', ascii(None)), + ('(?i)(abc|)ef', 'ABCDEF', '0,1', ascii(('EF', ''))), + ('(?i)(a|b)c*d', 'ABCD', '0,1', ascii(('BCD', 'B'))), + ('(?i)(ab|ab*)bc', 'ABC', '0,1', ascii(('ABC', 'A'))), + ('(?i)a([bc]*)c*', 'ABC', '0,1', ascii(('ABC', 'BC'))), + + ('(?i)a([bc]*)(c*d)', 'ABCD', '0,1,2', ascii(('ABCD', 'BC', 'D'))), + ('(?i)a([bc]+)(c*d)', 'ABCD', '0,1,2', ascii(('ABCD', 'BC', 'D'))), + ('(?i)a([bc]*)(c+d)', 'ABCD', '0,1,2', ascii(('ABCD', 'B', 'CD'))), + ('(?i)a[bcd]*dcdcde', 'ADCDCDE', '0', ascii('ADCDCDE')), + ('(?i)a[bcd]+dcdcde', 'ADCDCDE', '', ascii(None)), + ('(?i)(ab|a)b*c', 'ABC', '0,1', ascii(('ABC', 'AB'))), + ('(?i)((a)(b)c)(d)', 'ABCD', '1,2,3,4', ascii(('ABC', 'A', 'B', + 'D'))), + ('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', '0', ascii('ALPHA')), + ('(?i)^a(bc+|b[eh])g|.h$', 'ABH', '0,1', ascii(('BH', None))), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', '0,1,2', ascii(('EFFGZ', + 'EFFGZ', None))), + + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', '0,1,2', ascii(('IJ', 'IJ', + 'J'))), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', '', ascii(None)), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', '', ascii(None)), + ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', '0,1,2', ascii(('EFFGZ', + 'EFFGZ', None))), + ('(?i)((((((((((a))))))))))', 'A', '10', ascii('A')), + ('(?i)((((((((((a))))))))))\\10', 'AA', '0', ascii('AA')), + #('(?i)((((((((((a))))))))))\\41', 'AA', '', ascii(None)), + #('(?i)((((((((((a))))))))))\\41', 'A!', '0', ascii('A!')), + ('(?i)(((((((((a)))))))))', 'A', '0', ascii('A')), + ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', '1', + ascii('A')), + ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', '1', + ascii('C')), + ('(?i)multiple words of text', 'UH-UH', '', ascii(None)), + + ('(?i)multiple words', 'MULTIPLE WORDS, YEAH', '0', + ascii('MULTIPLE WORDS')), + ('(?i)(.*)c(.*)', 'ABCDE', '0,1,2', ascii(('ABCDE', 'AB', 'DE'))), + ('(?i)\\((.*), (.*)\\)', '(A, B)', '2,1', ascii(('B', 'A'))), + ('(?i)[k]', 'AB', '', ascii(None)), + # ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', ascii(ABCD-$&-\\ABCD)), + # ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', ascii(BC-$1-\\BC)), + ('(?i)a[-]?c', 'AC', '0', ascii('AC')), + ('(?i)(abc)\\1', 'ABCABC', '1', ascii('ABC')), + ('(?i)([a-c]*)\\1', 'ABCABC', '1', ascii('ABC')), + ('a(?!b).', 'abad', '0', ascii('ad')), + ('a(?=d).', 'abad', '0', ascii('ad')), + ('a(?=c|d).', 'abad', '0', ascii('ad')), + + ('a(?:b|c|d)(.)', 'ace', '1', ascii('e')), + ('a(?:b|c|d)*(.)', 'ace', '1', ascii('e')), + ('a(?:b|c|d)+?(.)', 'ace', '1', ascii('e')), + ('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', '1,2', ascii(('c', 'e'))), + + # Lookbehind: split by : but not if it is escaped by -. + ('(?]*?b', 'a>b', '', ascii(None)), + # Bug 490573: minimizing repeat problem. + (r'^a*?$', 'foo', '', ascii(None)), + # Bug 470582: nested groups problem. + (r'^((a)c)?(ab)$', 'ab', '1,2,3', ascii((None, None, 'ab'))), + # Another minimizing repeat problem (capturing groups in assertions). 
+        ('^([ab]*?)(?=(b)?)c', 'abc', '1,2', ascii(('ab', None))),
+        ('^([ab]*?)(?!(b))c', 'abc', '1,2', ascii(('ab', None))),
+        ('^([ab]*?)(?(.){0,2})d", "abcd").captures(1),
+            ['b', 'c'])
+        self.assertEqual(regex.search(r"(.)+", "a").captures(1), ['a'])
+
+    def test_guards(self):
+        m = regex.search(r"(X.*?Y\s*){3}(X\s*)+AB:",
+            "XY\nX Y\nX Y\nXY\nXX AB:")
+        self.assertEqual(m.span(0, 1, 2), ((3, 21), (12, 15), (16, 18)))
+
+        m = regex.search(r"(X.*?Y\s*){3,}(X\s*)+AB:",
+            "XY\nX Y\nX Y\nXY\nXX AB:")
+        self.assertEqual(m.span(0, 1, 2), ((0, 21), (12, 15), (16, 18)))
+
+        m = regex.search(r'\d{4}(\s*\w)?\W*((?!\d)\w){2}', "9999XX")
+        self.assertEqual(m.span(0, 1, 2), ((0, 6), (-1, -1), (5, 6)))
+
+        m = regex.search(r'A\s*?.*?(\n+.*?\s*?){0,2}\(X', 'A\n1\nS\n1 (X')
+        self.assertEqual(m.span(0, 1), ((0, 10), (5, 8)))
+
+        m = regex.search(r'Derde\s*:', 'aaaaaa:\nDerde:')
+        self.assertEqual(m.span(), (8, 14))
+        m = regex.search(r'Derde\s*:', 'aaaaa:\nDerde:')
+        self.assertEqual(m.span(), (7, 13))
+
+    def test_turkic(self):
+        # Turkish has dotted and dotless I/i.
+        pairs = "I=i;I=\u0131;i=\u0130"
+
+        all_chars = set()
+        matching = set()
+        for pair in pairs.split(";"):
+            ch1, ch2 = pair.split("=")
+            all_chars.update((ch1, ch2))
+            matching.add((ch1, ch1))
+            matching.add((ch1, ch2))
+            matching.add((ch2, ch1))
+            matching.add((ch2, ch2))
+
+        for ch1 in all_chars:
+            for ch2 in all_chars:
+                m = regex.match(r"(?i)\A" + ch1 + r"\Z", ch2)
+                if m:
+                    if (ch1, ch2) not in matching:
+                        self.fail("{} matching {}".format(ascii(ch1),
+                            ascii(ch2)))
+                else:
+                    if (ch1, ch2) in matching:
+                        self.fail("{} not matching {}".format(ascii(ch1),
+                            ascii(ch2)))
+
+    def test_named_lists(self):
+        options = ["one", "two", "three"]
+        self.assertEqual(regex.match(r"333\L<bar>444", "333one444",
+            bar=options).group(), "333one444")
+        self.assertEqual(regex.match(r"(?i)333\L<bar>444", "333TWO444",
+            bar=options).group(), "333TWO444")
+        self.assertEqual(regex.match(r"333\L<bar>444", "333four444",
+            bar=options), None)
+
+        options = [b"one", b"two", b"three"]
+        self.assertEqual(regex.match(br"333\L<bar>444", b"333one444",
+            bar=options).group(), b"333one444")
+        self.assertEqual(regex.match(br"(?i)333\L<bar>444", b"333TWO444",
+            bar=options).group(), b"333TWO444")
+        self.assertEqual(regex.match(br"333\L<bar>444", b"333four444",
+            bar=options), None)
+
+        self.assertEqual(repr(type(regex.compile(r"3\L<bar>4\L<bar>+5",
+            bar=["one", "two", "three"]))), self.PATTERN_CLASS)
+
+        self.assertEqual(regex.findall(r"^\L<options>", "solid QWERT",
+            options=set(['good', 'brilliant', '+s\\ol[i}d'])), [])
+        self.assertEqual(regex.findall(r"^\L<options>", "+solid QWERT",
+            options=set(['good', 'brilliant', '+solid'])), ['+solid'])
+
+        options = ["STRASSE"]
+        self.assertEqual(regex.match(r"(?fi)\L<words>",
+            "stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0,
+            6))
+
+        options = ["STRASSE", "stress"]
+        self.assertEqual(regex.match(r"(?fi)\L<words>",
+            "stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0,
+            6))
+
+        options = ["stra\N{LATIN SMALL LETTER SHARP S}e"]
+        self.assertEqual(regex.match(r"(?fi)\L<words>", "STRASSE",
+            words=options).span(), (0, 7))
+
+        options = ["kit"]
+        self.assertEqual(regex.search(r"(?i)\L<words>", "SKITS",
+            words=options).span(), (1, 4))
+        self.assertEqual(regex.search(r"(?i)\L<words>",
+            "SK\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}TS",
+            words=options).span(), (1, 4))
+
+        self.assertEqual(regex.search(r"(?fi)\b(\w+) +\1\b",
+            " stra\N{LATIN SMALL LETTER SHARP S}e STRASSE ").span(), (1, 15))
+        self.assertEqual(regex.search(r"(?fi)\b(\w+) +\1\b",
+            " STRASSE stra\N{LATIN SMALL LETTER SHARP S}e 
").span(), (1, 15)) + + self.assertEqual(regex.search(r"^\L$", "", options=[]).span(), + (0, 0)) + + def test_fuzzy(self): + # Some tests borrowed from TRE library tests. + self.assertEqual(repr(type(regex.compile('(fou){s,e<=1}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(fuu){s}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(fuu){s,e}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(anaconda){1i+1d<1,s<=1}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(anaconda){1i+1d<1,s<=1,e<=10}'))), + self.PATTERN_CLASS) + self.assertEqual(repr(type(regex.compile('(anaconda){s<=1,e<=1,1i+1d<1}'))), + self.PATTERN_CLASS) + + text = 'molasses anaconda foo bar baz smith anderson ' + self.assertEqual(regex.search('(znacnda){s<=1,e<=3,1i+1d<1}', text), + None) + self.assertEqual(regex.search('(znacnda){s<=1,e<=3,1i+1d<2}', + text).span(0, 1), ((9, 17), (9, 17))) + self.assertEqual(regex.search('(ananda){1i+1d<2}', text), None) + self.assertEqual(regex.search(r"(?:\bznacnda){e<=2}", text)[0], + "anaconda") + self.assertEqual(regex.search(r"(?:\bnacnda){e<=2}", text)[0], + "anaconda") + + text = 'anaconda foo bar baz smith anderson' + self.assertEqual(regex.search('(fuu){i<=3,d<=3,e<=5}', text).span(0, + 1), ((0, 0), (0, 0))) + self.assertEqual(regex.search('(?b)(fuu){i<=3,d<=3,e<=5}', + text).span(0, 1), ((9, 10), (9, 10))) + self.assertEqual(regex.search('(fuu){i<=2,d<=2,e<=5}', text).span(0, + 1), ((7, 10), (7, 10))) + self.assertEqual(regex.search('(?e)(fuu){i<=2,d<=2,e<=5}', + text).span(0, 1), ((9, 10), (9, 10))) + self.assertEqual(regex.search('(fuu){i<=3,d<=3,e}', text).span(0, 1), + ((0, 0), (0, 0))) + self.assertEqual(regex.search('(?b)(fuu){i<=3,d<=3,e}', text).span(0, + 1), ((9, 10), (9, 10))) + + self.assertEqual(repr(type(regex.compile('(approximate){s<=3,1i+1d<3}'))), + self.PATTERN_CLASS) + + # No cost limit. + self.assertEqual(regex.search('(foobar){e}', + 'xirefoabralfobarxie').span(0, 1), ((0, 6), (0, 6))) + self.assertEqual(regex.search('(?e)(foobar){e}', + 'xirefoabralfobarxie').span(0, 1), ((0, 3), (0, 3))) + self.assertEqual(regex.search('(?b)(foobar){e}', + 'xirefoabralfobarxie').span(0, 1), ((11, 16), (11, 16))) + + # At most two errors. + self.assertEqual(regex.search('(foobar){e<=2}', + 'xirefoabrzlfd').span(0, 1), ((4, 9), (4, 9))) + self.assertEqual(regex.search('(foobar){e<=2}', 'xirefoabzlfd'), None) + + # At most two inserts or substitutions and max two errors total. + self.assertEqual(regex.search('(foobar){i<=2,s<=2,e<=2}', + 'oobargoobaploowap').span(0, 1), ((5, 11), (5, 11))) + + # Find best whole word match for "foobar". + self.assertEqual(regex.search('\\b(foobar){e}\\b', 'zfoobarz').span(0, + 1), ((0, 8), (0, 8))) + self.assertEqual(regex.search('\\b(foobar){e}\\b', + 'boing zfoobarz goobar woop').span(0, 1), ((0, 6), (0, 6))) + self.assertEqual(regex.search('(?b)\\b(foobar){e}\\b', + 'boing zfoobarz goobar woop').span(0, 1), ((15, 21), (15, 21))) + + # Match whole string, allow only 1 error. 
+ self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobar').span(0, 1), + ((0, 6), (0, 6))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoobar').span(0, + 1), ((0, 7), (0, 7))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobarx').span(0, + 1), ((0, 7), (0, 7))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'fooxbar').span(0, + 1), ((0, 7), (0, 7))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foxbar').span(0, 1), + ((0, 6), (0, 6))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xoobar').span(0, 1), + ((0, 6), (0, 6))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobax').span(0, 1), + ((0, 6), (0, 6))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'oobar').span(0, 1), + ((0, 5), (0, 5))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'fobar').span(0, 1), + ((0, 5), (0, 5))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'fooba').span(0, 1), + ((0, 5), (0, 5))) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoobarx'), None) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobarxx'), None) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xxfoobar'), None) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoxbar'), None) + self.assertEqual(regex.search('^(foobar){e<=1}$', 'foxbarx'), None) + + # At most one insert, two deletes, and three substitutions. + # Additionally, deletes cost two and substitutes one, and total + # cost must be less than 4. + self.assertEqual(regex.search('(foobar){i<=1,d<=2,s<=3,2d+1s<4}', + '3oifaowefbaoraofuiebofasebfaobfaorfeoaro').span(0, 1), ((6, 13), (6, + 13))) + self.assertEqual(regex.search('(?b)(foobar){i<=1,d<=2,s<=3,2d+1s<4}', + '3oifaowefbaoraofuiebofasebfaobfaorfeoaro').span(0, 1), ((34, 39), + (34, 39))) + + # Partially fuzzy matches. + self.assertEqual(regex.search('foo(bar){e<=1}zap', 'foobarzap').span(0, + 1), ((0, 9), (3, 6))) + self.assertEqual(regex.search('foo(bar){e<=1}zap', 'fobarzap'), None) + self.assertEqual(regex.search('foo(bar){e<=1}zap', 'foobrzap').span(0, + 1), ((0, 8), (3, 5))) + + text = ('www.cnn.com 64.236.16.20\nwww.slashdot.org 66.35.250.150\n' + 'For useful information, use www.slashdot.org\nthis is demo data!\n') + self.assertEqual(regex.search(r'(?s)^.*(dot.org){e}.*$', text).span(0, + 1), ((0, 120), (120, 120))) + self.assertEqual(regex.search(r'(?es)^.*(dot.org){e}.*$', text).span(0, + 1), ((0, 120), (93, 100))) + self.assertEqual(regex.search(r'^.*(dot.org){e}.*$', text).span(0, 1), + ((0, 119), (24, 101))) + + # Behaviour is unexpected, but arguably not wrong. It first finds the + # best match, then the best in what follows, etc. 
+ self.assertEqual(regex.findall(r"\b\L{e<=1}\b", + " book cot dog desk ", words="cat dog".split()), ["cot", "dog"]) + self.assertEqual(regex.findall(r"\b\L{e<=1}\b", + " book dog cot desk ", words="cat dog".split()), [" dog", "cot"]) + self.assertEqual(regex.findall(r"(?e)\b\L{e<=1}\b", + " book dog cot desk ", words="cat dog".split()), ["dog", "cot"]) + self.assertEqual(regex.findall(r"(?r)\b\L{e<=1}\b", + " book cot dog desk ", words="cat dog".split()), ["dog ", "cot"]) + self.assertEqual(regex.findall(r"(?er)\b\L{e<=1}\b", + " book cot dog desk ", words="cat dog".split()), ["dog", "cot"]) + self.assertEqual(regex.findall(r"(?r)\b\L{e<=1}\b", + " book dog cot desk ", words="cat dog".split()), ["cot", "dog"]) + self.assertEqual(regex.findall(br"\b\L{e<=1}\b", + b" book cot dog desk ", words=b"cat dog".split()), [b"cot", b"dog"]) + self.assertEqual(regex.findall(br"\b\L{e<=1}\b", + b" book dog cot desk ", words=b"cat dog".split()), [b" dog", b"cot"]) + self.assertEqual(regex.findall(br"(?e)\b\L{e<=1}\b", + b" book dog cot desk ", words=b"cat dog".split()), [b"dog", b"cot"]) + self.assertEqual(regex.findall(br"(?r)\b\L{e<=1}\b", + b" book cot dog desk ", words=b"cat dog".split()), [b"dog ", b"cot"]) + self.assertEqual(regex.findall(br"(?er)\b\L{e<=1}\b", + b" book cot dog desk ", words=b"cat dog".split()), [b"dog", b"cot"]) + self.assertEqual(regex.findall(br"(?r)\b\L{e<=1}\b", + b" book dog cot desk ", words=b"cat dog".split()), [b"cot", b"dog"]) + + self.assertEqual(regex.search(r"(\w+) (\1{e<=1})", "foo fou").groups(), + ("foo", "fou")) + self.assertEqual(regex.search(r"(?r)(\2{e<=1}) (\w+)", + "foo fou").groups(), ("foo", "fou")) + self.assertEqual(regex.search(br"(\w+) (\1{e<=1})", + b"foo fou").groups(), (b"foo", b"fou")) + + self.assertEqual(regex.findall(r"(?:(?:QR)+){e}", "abcde"), ["abcde", + ""]) + self.assertEqual(regex.findall(r"(?:Q+){e}", "abc"), ["abc", ""]) + + # Hg issue 41: = for fuzzy matches + self.assertEqual(regex.match(r"(?:service detection){0[^()]+)|(?R))*\)", "(ab(cd)ef)")[ + : ], ("(ab(cd)ef)", "ef")) + self.assertEqual(regex.search(r"\(((?>[^()]+)|(?R))*\)", + "(ab(cd)ef)").captures(1), ["ab", "cd", "(cd)", "ef"]) + + self.assertEqual(regex.search(r"(?r)\(((?R)|(?>[^()]+))*\)", + "(ab(cd)ef)")[ : ], ("(ab(cd)ef)", "ab")) + self.assertEqual(regex.search(r"(?r)\(((?R)|(?>[^()]+))*\)", + "(ab(cd)ef)").captures(1), ["ef", "cd", "(cd)", "ab"]) + + self.assertEqual(regex.search(r"\(([^()]+|(?R))*\)", + "some text (a(b(c)d)e) more text")[ : ], ("(a(b(c)d)e)", "e")) + + self.assertEqual(regex.search(r"(?r)\(((?R)|[^()]+)*\)", + "some text (a(b(c)d)e) more text")[ : ], ("(a(b(c)d)e)", "a")) + + self.assertEqual(regex.search(r"(foo(\(((?:(?>[^()]+)|(?2))*)\)))", + "foo(bar(baz)+baz(bop))")[ : ], ("foo(bar(baz)+baz(bop))", + "foo(bar(baz)+baz(bop))", "(bar(baz)+baz(bop))", + "bar(baz)+baz(bop)")) + + self.assertEqual(regex.search(r"(?r)(foo(\(((?:(?2)|(?>[^()]+))*)\)))", + "foo(bar(baz)+baz(bop))")[ : ], ("foo(bar(baz)+baz(bop))", + "foo(bar(baz)+baz(bop))", "(bar(baz)+baz(bop))", + "bar(baz)+baz(bop)")) + + rgx = regex.compile(r"""^\s*(<\s*([a-zA-Z:]+)(?:\s*[a-zA-Z:]*\s*=\s*(?:'[^']*'|"[^"]*"))*\s*(/\s*)?>(?:[^<>]*|(?1))*(?(3)|<\s*/\s*\2\s*>))\s*$""") + self.assertEqual(bool(rgx.search('')), True) + self.assertEqual(bool(rgx.search('')), False) + self.assertEqual(bool(rgx.search('')), True) + self.assertEqual(bool(rgx.search('')), False) + self.assertEqual(bool(rgx.search('')), False) + + self.assertEqual(bool(rgx.search('')), False) + 
+        self.assertEqual(bool(rgx.search('<foo bar="baz"/>')), True)
+        self.assertEqual(bool(rgx.search('<    fooo    /  >')), True)
+        # The next regex should and does match. Perl 5.14 agrees.
+        #self.assertEqual(bool(rgx.search('<foo/>foo')), False)
+        self.assertEqual(bool(rgx.search('foo<foo/>')), False)
+
+        self.assertEqual(bool(rgx.search('<foo>foo</foo>')), True)
+        self.assertEqual(bool(rgx.search('<foo><bar/>foo</foo>')), True)
+        self.assertEqual(bool(rgx.search('<a><b><c></c></b></a>')), True)
+
+    def test_copy(self):
+        # PatternObjects are immutable, therefore there's no need to clone
+        # them.
+        r = regex.compile("a")
+        self.assertTrue(copy.copy(r) is r)
+        self.assertTrue(copy.deepcopy(r) is r)
+
+        # MatchObjects are normally mutable because the target string can be
+        # detached. However, after the target string has been detached, a
+        # MatchObject becomes immutable, so there's no need to clone it.
+        m = r.match("a")
+        self.assertTrue(copy.copy(m) is not m)
+        self.assertTrue(copy.deepcopy(m) is not m)
+
+        self.assertTrue(m.string is not None)
+        m2 = copy.copy(m)
+        m2.detach_string()
+        self.assertTrue(m.string is not None)
+        self.assertTrue(m2.string is None)
+
+        # The following behaviour matches that of the re module.
+        it = regex.finditer(".", "ab")
+        it2 = copy.copy(it)
+        self.assertEqual(next(it).group(), "a")
+        self.assertEqual(next(it2).group(), "b")
+
+        # The following behaviour matches that of the re module.
+        it = regex.finditer(".", "ab")
+        it2 = copy.deepcopy(it)
+        self.assertEqual(next(it).group(), "a")
+        self.assertEqual(next(it2).group(), "b")
+
+        # The following behaviour is designed to match that of copying
+        # 'finditer'.
+        it = regex.splititer(" ", "a b")
+        it2 = copy.copy(it)
+        self.assertEqual(next(it), "a")
+        self.assertEqual(next(it2), "b")
+
+        # The following behaviour is designed to match that of copying
+        # 'finditer'.
+        it = regex.splititer(" ", "a b")
+        it2 = copy.deepcopy(it)
+        self.assertEqual(next(it), "a")
+        self.assertEqual(next(it2), "b")
+
+    def test_format(self):
+        self.assertEqual(regex.subf(r"(\w+) (\w+)", "{0} => {2} {1}",
+            "foo bar"), "foo bar => bar foo")
+        self.assertEqual(regex.subf(r"(?<word1>\w+) (?<word2>\w+)",
+            "{word2} {word1}", "foo bar"), "bar foo")
+
+        self.assertEqual(regex.subfn(r"(\w+) (\w+)", "{0} => {2} {1}",
+            "foo bar"), ("foo bar => bar foo", 1))
+        self.assertEqual(regex.subfn(r"(?<word1>\w+) (?<word2>\w+)",
+            "{word2} {word1}", "foo bar"), ("bar foo", 1))
+
+        self.assertEqual(regex.match(r"(\w+) (\w+)",
+            "foo bar").expandf("{0} => {2} {1}"), "foo bar => bar foo")
+
+    def test_fullmatch(self):
+        self.assertEqual(bool(regex.fullmatch(r"abc", "abc")), True)
+        self.assertEqual(bool(regex.fullmatch(r"abc", "abcx")), False)
+        self.assertEqual(bool(regex.fullmatch(r"abc", "abcx", endpos=3)), True)
+
+        self.assertEqual(bool(regex.fullmatch(r"abc", "xabc", pos=1)), True)
+        self.assertEqual(bool(regex.fullmatch(r"abc", "xabcy", pos=1)), False)
+        self.assertEqual(bool(regex.fullmatch(r"abc", "xabcy", pos=1,
+            endpos=4)), True)
+
+        self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abc")), True)
+        self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abcx")), False)
+        self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abcx", endpos=3)),
+            True)
+
+        self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabc", pos=1)),
+            True)
+        self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabcy", pos=1)),
+            False)
+        self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabcy", pos=1,
+            endpos=4)), True)
+
+    def test_issue_18468(self):
+        self.assertTypedEqual(regex.sub('y', 'a', 'xyz'), 'xaz')
+        self.assertTypedEqual(regex.sub('y', StrSubclass('a'),
+            StrSubclass('xyz')), 'xaz')
+        self.assertTypedEqual(regex.sub(b'y', b'a', b'xyz'), b'xaz')
+        self.assertTypedEqual(regex.sub(b'y', BytesSubclass(b'a'),
+            BytesSubclass(b'xyz')), b'xaz')
+        self.assertTypedEqual(regex.sub(b'y', bytearray(b'a'),
+            bytearray(b'xyz')), b'xaz')
+        self.assertTypedEqual(regex.sub(b'y', memoryview(b'a'),
+            memoryview(b'xyz')), b'xaz')
+
+        for string in ":a:b::c", StrSubclass(":a:b::c"):
+            self.assertTypedEqual(regex.split(":", string), ['', 'a', 'b', '',
+                'c'])
+            if sys.version_info >= (3, 7, 0):
+                self.assertTypedEqual(regex.split(":*", string), ['', '', 'a',
+                    '', 'b', '', 'c', ''])
+                self.assertTypedEqual(regex.split("(:*)", string), ['', ':',
+                    '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', ''])
+            else:
+                self.assertTypedEqual(regex.split(":*", string), ['', 'a', 'b',
+                    'c'])
+                self.assertTypedEqual(regex.split("(:*)", string), ['', ':',
+                    'a', ':', 'b', '::', 'c'])
+
+        for string in (b":a:b::c", BytesSubclass(b":a:b::c"),
+            bytearray(b":a:b::c"), memoryview(b":a:b::c")):
+            self.assertTypedEqual(regex.split(b":", string), [b'', b'a', b'b',
+                b'', b'c'])
+            if sys.version_info >= (3, 7, 0):
+                self.assertTypedEqual(regex.split(b":*", string), [b'', b'',
+                    b'a', b'', b'b', b'', b'c', b''])
+                self.assertTypedEqual(regex.split(b"(:*)", string), [b'', b':',
+                    b'', b'', b'a', b':', b'', b'', b'b', b'::', b'', b'',
+                    b'c', b'', b''])
+            else:
+                self.assertTypedEqual(regex.split(b":*", string), [b'', b'a',
+                    b'b', b'c'])
+                self.assertTypedEqual(regex.split(b"(:*)", string), [b'', b':',
+                    b'a', b':', b'b', b'::', b'c'])
+
+        for string in "a:b::c:::d", StrSubclass("a:b::c:::d"):
+            self.assertTypedEqual(regex.findall(":+", string), [":", "::",
+                ":::"])
+            self.assertTypedEqual(regex.findall("(:+)", string), [":", "::",
+                ":::"])
self.assertTypedEqual(regex.findall("(:)(:*)", string), [(":", ""), + (":", ":"), (":", "::")]) + + for string in (b"a:b::c:::d", BytesSubclass(b"a:b::c:::d"), + bytearray(b"a:b::c:::d"), memoryview(b"a:b::c:::d")): + self.assertTypedEqual(regex.findall(b":+", string), [b":", b"::", + b":::"]) + self.assertTypedEqual(regex.findall(b"(:+)", string), [b":", b"::", + b":::"]) + self.assertTypedEqual(regex.findall(b"(:)(:*)", string), [(b":", + b""), (b":", b":"), (b":", b"::")]) + + for string in 'a', StrSubclass('a'): + self.assertEqual(regex.match('a', string).groups(), ()) + self.assertEqual(regex.match('(a)', string).groups(), ('a',)) + self.assertEqual(regex.match('(a)', string).group(0), 'a') + self.assertEqual(regex.match('(a)', string).group(1), 'a') + self.assertEqual(regex.match('(a)', string).group(1, 1), ('a', + 'a')) + + for string in (b'a', BytesSubclass(b'a'), bytearray(b'a'), + memoryview(b'a')): + self.assertEqual(regex.match(b'a', string).groups(), ()) + self.assertEqual(regex.match(b'(a)', string).groups(), (b'a',)) + self.assertEqual(regex.match(b'(a)', string).group(0), b'a') + self.assertEqual(regex.match(b'(a)', string).group(1), b'a') + self.assertEqual(regex.match(b'(a)', string).group(1, 1), (b'a', + b'a')) + + def test_partial(self): + self.assertEqual(regex.match('ab', 'a', partial=True).partial, True) + self.assertEqual(regex.match('ab', 'a', partial=True).span(), (0, 1)) + self.assertEqual(regex.match(r'cats', 'cat', partial=True).partial, + True) + self.assertEqual(regex.match(r'cats', 'cat', partial=True).span(), (0, + 3)) + self.assertEqual(regex.match(r'cats', 'catch', partial=True), None) + self.assertEqual(regex.match(r'abc\w{3}', 'abcdef', + partial=True).partial, False) + self.assertEqual(regex.match(r'abc\w{3}', 'abcdef', + partial=True).span(), (0, 6)) + self.assertEqual(regex.match(r'abc\w{3}', 'abcde', + partial=True).partial, True) + self.assertEqual(regex.match(r'abc\w{3}', 'abcde', + partial=True).span(), (0, 5)) + + self.assertEqual(regex.match(r'\d{4}$', '1234', partial=True).partial, + False) + + self.assertEqual(regex.match(r'\L', 'post', partial=True, + words=['post']).partial, False) + self.assertEqual(regex.match(r'\L', 'post', partial=True, + words=['post']).span(), (0, 4)) + self.assertEqual(regex.match(r'\L', 'pos', partial=True, + words=['post']).partial, True) + self.assertEqual(regex.match(r'\L', 'pos', partial=True, + words=['post']).span(), (0, 3)) + + self.assertEqual(regex.match(r'(?fi)\L', 'POST', partial=True, + words=['po\uFB06']).partial, False) + self.assertEqual(regex.match(r'(?fi)\L', 'POST', partial=True, + words=['po\uFB06']).span(), (0, 4)) + self.assertEqual(regex.match(r'(?fi)\L', 'POS', partial=True, + words=['po\uFB06']).partial, True) + self.assertEqual(regex.match(r'(?fi)\L', 'POS', partial=True, + words=['po\uFB06']).span(), (0, 3)) + self.assertEqual(regex.match(r'(?fi)\L', 'po\uFB06', + partial=True, words=['POS']), None) + + self.assertEqual(regex.match(r'[a-z]*4R$', 'a', partial=True).span(), + (0, 1)) + self.assertEqual(regex.match(r'[a-z]*4R$', 'ab', partial=True).span(), + (0, 2)) + self.assertEqual(regex.match(r'[a-z]*4R$', 'ab4', partial=True).span(), + (0, 3)) + self.assertEqual(regex.match(r'[a-z]*4R$', 'a4', partial=True).span(), + (0, 2)) + self.assertEqual(regex.match(r'[a-z]*4R$', 'a4R', partial=True).span(), + (0, 3)) + self.assertEqual(regex.match(r'[a-z]*4R$', '4a', partial=True), None) + self.assertEqual(regex.match(r'[a-z]*4R$', 'a44', partial=True), None) + + def test_hg_bugs(self): + # Hg 
issue 28: regex.compile("(?>b)") causes "TypeError: 'Character' + # object is not subscriptable" + self.assertEqual(bool(regex.compile("(?>b)", flags=regex.V1)), True) + + # Hg issue 29: regex.compile("^((?>\w+)|(?>\s+))*$") causes + # "TypeError: 'GreedyRepeat' object is not iterable" + self.assertEqual(bool(regex.compile(r"^((?>\w+)|(?>\s+))*$", + flags=regex.V1)), True) + + # Hg issue 31: atomic and normal groups in recursive patterns + self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", + "a(bcd(e)f)g(h)"), ['(bcd(e)f)', '(h)']) + self.assertEqual(regex.findall(r"\((?:(?:[^()]+)|(?R))*\)", + "a(bcd(e)f)g(h)"), ['(bcd(e)f)', '(h)']) + self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", + "a(b(cd)e)f)g)h"), ['(b(cd)e)']) + self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", + "a(bc(d(e)f)gh"), ['(d(e)f)']) + self.assertEqual(regex.findall(r"(?r)\((?:(?>[^()]+)|(?R))*\)", + "a(bc(d(e)f)gh"), ['(d(e)f)']) + self.assertEqual([m.group() for m in + regex.finditer(r"\((?:[^()]*+|(?0))*\)", "a(b(c(de)fg)h")], + ['(c(de)fg)']) + + # Hg issue 32: regex.search("a(bc)d", "abcd", regex.I|regex.V1) returns + # None + self.assertEqual(regex.search("a(bc)d", "abcd", regex.I | + regex.V1).group(0), "abcd") + + # Hg issue 33: regex.search("([\da-f:]+)$", "E", regex.I|regex.V1) + # returns None + self.assertEqual(regex.search(r"([\da-f:]+)$", "E", regex.I | + regex.V1).group(0), "E") + self.assertEqual(regex.search(r"([\da-f:]+)$", "e", regex.I | + regex.V1).group(0), "e") + + # Hg issue 34: regex.search("^(?=ab(de))(abd)(e)", "abde").groups() + # returns (None, 'abd', 'e') instead of ('de', 'abd', 'e') + self.assertEqual(regex.search("^(?=ab(de))(abd)(e)", "abde").groups(), + ('de', 'abd', 'e')) + + # Hg issue 35: regex.compile("\ ", regex.X) causes "_regex_core.error: + # bad escape" + self.assertEqual(bool(regex.match(r"\ ", " ", flags=regex.X)), True) + + # Hg issue 36: regex.search("^(a|)\1{2}b", "b") returns None + self.assertEqual(regex.search(r"^(a|)\1{2}b", "b").group(0, 1), ('b', + '')) + + # Hg issue 37: regex.search("^(a){0,0}", "abc").group(0,1) returns + # ('a', 'a') instead of ('', None) + self.assertEqual(regex.search("^(a){0,0}", "abc").group(0, 1), ('', + None)) + + # Hg issue 38: regex.search("(?>.*/)b", "a/b") returns None + self.assertEqual(regex.search("(?>.*/)b", "a/b").group(0), "a/b") + + # Hg issue 39: regex.search("((?i)blah)\\s+\\1", "blah BLAH") doesn't + # return None + # Changed to positional flags in regex 2023.12.23. 
+ self.assertEqual(regex.search(r"((?i)blah)\s+\1", "blah BLAH"), None) + + # Hg issue 40: regex.search("(\()?[^()]+(?(1)\)|)", "(abcd").group(0) + # returns "bcd" instead of "abcd" + self.assertEqual(regex.search(r"(\()?[^()]+(?(1)\)|)", + "(abcd").group(0), "abcd") + + # Hg issue 42: regex.search("(a*)*", "a", flags=regex.V1).span(1) + # returns (0, 1) instead of (1, 1) + self.assertEqual(regex.search("(a*)*", "a").span(1), (1, 1)) + self.assertEqual(regex.search("(a*)*", "aa").span(1), (2, 2)) + self.assertEqual(regex.search("(a*)*", "aaa").span(1), (3, 3)) + + # Hg issue 43: regex.compile("a(?#xxx)*") causes "_regex_core.error: + # nothing to repeat" + self.assertEqual(regex.search("a(?#xxx)*", "aaa").group(), "aaa") + + # Hg issue 44: regex.compile("(?=abc){3}abc") causes + # "_regex_core.error: nothing to repeat" + self.assertEqual(regex.search("(?=abc){3}abc", "abcabcabc").span(), (0, + 3)) + + # Hg issue 45: regex.compile("^(?:a(?:(?:))+)+") causes + # "_regex_core.error: nothing to repeat" + self.assertEqual(regex.search("^(?:a(?:(?:))+)+", "a").span(), (0, 1)) + self.assertEqual(regex.search("^(?:a(?:(?:))+)+", "aa").span(), (0, 2)) + + # Hg issue 46: regex.compile("a(?x: b c )d") causes + # "_regex_core.error: missing )" + self.assertEqual(regex.search("a(?x: b c )d", "abcd").group(0), "abcd") + + # Hg issue 47: regex.compile("a#comment\n*", flags=regex.X) causes + # "_regex_core.error: nothing to repeat" + self.assertEqual(regex.search("a#comment\n*", "aaa", + flags=regex.X).group(0), "aaa") + + # Hg issue 48: regex.search("(a(?(1)\\1)){4}", "a"*10, + # flags=regex.V1).group(0,1) returns ('aaaaa', 'a') instead of ('aaaaaaaaaa', 'aaaa') + self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){1}", + "aaaaaaaaaa").span(0, 1), ((0, 1), (0, 1))) + self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){2}", + "aaaaaaaaaa").span(0, 1), ((0, 3), (1, 3))) + self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){3}", + "aaaaaaaaaa").span(0, 1), ((0, 6), (3, 6))) + self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){4}", + "aaaaaaaaaa").span(0, 1), ((0, 10), (6, 10))) + + # Hg issue 49: regex.search("(a)(?<=b(?1))", "baz", regex.V1) returns + # None incorrectly + self.assertEqual(regex.search("(?V1)(a)(?<=b(?1))", "baz").group(0), + "a") + + # Hg issue 50: not all keywords are found by named list with + # overlapping keywords when full Unicode casefolding is required + self.assertEqual(regex.findall(r'(?fi)\L', + 'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05', + keywords=['post','pos']), ['POST', 'Post', 'post', 'po\u017Ft', + 'po\uFB06', 'po\uFB05']) + self.assertEqual(regex.findall(r'(?fi)pos|post', + 'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), ['POS', + 'Pos', 'pos', 'po\u017F', 'po\uFB06', 'po\uFB05']) + self.assertEqual(regex.findall(r'(?fi)post|pos', + 'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), ['POST', + 'Post', 'post', 'po\u017Ft', 'po\uFB06', 'po\uFB05']) + self.assertEqual(regex.findall(r'(?fi)post|another', + 'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), ['POST', + 'Post', 'post', 'po\u017Ft', 'po\uFB06', 'po\uFB05']) + + # Hg issue 51: regex.search("((a)(?1)|(?2))", "a", flags=regex.V1) + # returns None incorrectly + self.assertEqual(regex.search("(?V1)((a)(?1)|(?2))", "a").group(0, 1, + 2), ('a', 'a', None)) + + # Hg issue 52: regex.search("(\\1xx|){6}", "xx", + # flags=regex.V1).span(0,1) returns incorrect value + self.assertEqual(regex.search(r"(?V1)(\1xx|){6}", "xx").span(0, 1), + ((0, 2), (2, 2))) + + # Hg issue 53: regex.search("(a|)+", "a") causes 
MemoryError + self.assertEqual(regex.search("(a|)+", "a").group(0, 1), ("a", "")) + + # Hg issue 54: regex.search("(a|)*\\d", "a"*80) causes MemoryError + self.assertEqual(regex.search(r"(a|)*\d", "a" * 80), None) + + # Hg issue 55: regex.search("^(?:a?b?)*$", "ac") take a very long time. + self.assertEqual(regex.search("^(?:a?b?)*$", "ac"), None) + + # Hg issue 58: bad named character escape sequences like "\\N{1}" + # treats as "N" + self.assertRaisesRegex(regex.error, self.UNDEF_CHAR_NAME, lambda: + regex.compile("\\N{1}")) + + # Hg issue 59: regex.search("\\Z", "a\na\n") returns None incorrectly + self.assertEqual(regex.search("\\Z", "a\na\n").span(0), (4, 4)) + + # Hg issue 60: regex.search("(q1|.)*(q2|.)*(x(a|bc)*y){2,}", "xayxay") + # returns None incorrectly + self.assertEqual(regex.search("(q1|.)*(q2|.)*(x(a|bc)*y){2,}", + "xayxay").group(0), "xayxay") + + # Hg issue 61: regex.search("[^a]", "A", regex.I).group(0) returns '' + # incorrectly + self.assertEqual(regex.search("(?i)[^a]", "A"), None) + + # Hg issue 63: regex.search("[[:ascii:]]", "\N{KELVIN SIGN}", + # flags=regex.I|regex.V1) doesn't return None + self.assertEqual(regex.search("(?i)[[:ascii:]]", "\N{KELVIN SIGN}"), + None) + + # Hg issue 66: regex.search("((a|b(?1)c){3,5})", "baaaaca", + # flags=regex.V1).groups() returns ('baaaac', 'baaaac') instead of ('aaaa', 'a') + self.assertEqual(regex.search("((a|b(?1)c){3,5})", "baaaaca").group(0, + 1, 2), ('aaaa', 'aaaa', 'a')) + + # Hg issue 71: non-greedy quantifier in lookbehind + self.assertEqual(regex.findall(r"(?<=:\S+ )\w+", ":9 abc :10 def"), + ['abc', 'def']) + self.assertEqual(regex.findall(r"(?<=:\S* )\w+", ":9 abc :10 def"), + ['abc', 'def']) + self.assertEqual(regex.findall(r"(?<=:\S+? )\w+", ":9 abc :10 def"), + ['abc', 'def']) + self.assertEqual(regex.findall(r"(?<=:\S*? )\w+", ":9 abc :10 def"), + ['abc', 'def']) + + # Hg issue 73: conditional patterns + self.assertEqual(regex.search(r"(?:fe)?male", "female").group(), + "female") + self.assertEqual([m.group() for m in + regex.finditer(r"(fe)?male: h(?(1)(er)|(is)) (\w+)", + "female: her dog; male: his cat. asdsasda")], ['female: her dog', + 'male: his cat']) + + # Hg issue 78: "Captures" doesn't work for recursive calls + self.assertEqual(regex.search(r'(?\((?:[^()]++|(?&rec))*\))', + 'aaa(((1+0)+1)+1)bbb').captures('rec'), ['(1+0)', '((1+0)+1)', + '(((1+0)+1)+1)']) + + # Hg issue 80: Escape characters throws an exception + self.assertRaisesRegex(regex.error, self.BAD_ESCAPE, lambda: + regex.sub('x', '\\', 'x'), ) + + # Hg issue 82: error range does not work + fz = "(CAGCCTCCCATTTCAGAATATACATCC){1a(?b))', "ab").spans("x"), [(1, + 2), (0, 2)]) + + # Hg issue 91: match.expand is extremely slow + # Check that the replacement cache works. 
+ self.assertEqual(regex.sub(r'(-)', lambda m: m.expand(r'x'), 'a-b-c'), + 'axbxc') + + # Hg issue 94: Python crashes when executing regex updates + # pattern.findall + rx = regex.compile(r'\bt(est){i<2}', flags=regex.V1) + self.assertEqual(rx.search("Some text"), None) + self.assertEqual(rx.findall("Some text"), []) + + # Hg issue 95: 'pos' for regex.error + self.assertRaisesRegex(regex.error, self.MULTIPLE_REPEAT, lambda: + regex.compile(r'.???')) + + # Hg issue 97: behaviour of regex.escape's special_only is wrong + # + # Hg issue 244: Make `special_only=True` the default in + # `regex.escape()` + self.assertEqual(regex.escape('foo!?', special_only=False), 'foo\\!\\?') + self.assertEqual(regex.escape('foo!?', special_only=True), 'foo!\\?') + self.assertEqual(regex.escape('foo!?'), 'foo!\\?') + + self.assertEqual(regex.escape(b'foo!?', special_only=False), b'foo\\!\\?') + self.assertEqual(regex.escape(b'foo!?', special_only=True), + b'foo!\\?') + self.assertEqual(regex.escape(b'foo!?'), b'foo!\\?') + + # Hg issue 100: strange results from regex.search + self.assertEqual(regex.search('^([^z]*(?:WWWi|W))?$', + 'WWWi').groups(), ('WWWi', )) + self.assertEqual(regex.search('^([^z]*(?:WWWi|w))?$', + 'WWWi').groups(), ('WWWi', )) + self.assertEqual(regex.search('^([^z]*?(?:WWWi|W))?$', + 'WWWi').groups(), ('WWWi', )) + + # Hg issue 101: findall() broken (seems like memory corruption) + pat = regex.compile(r'xxx', flags=regex.FULLCASE | regex.UNICODE) + self.assertEqual([x.group() for x in pat.finditer('yxxx')], ['xxx']) + self.assertEqual(pat.findall('yxxx'), ['xxx']) + + raw = 'yxxx' + self.assertEqual([x.group() for x in pat.finditer(raw)], ['xxx']) + self.assertEqual(pat.findall(raw), ['xxx']) + + pat = regex.compile(r'xxx', flags=regex.FULLCASE | regex.IGNORECASE | + regex.UNICODE) + self.assertEqual([x.group() for x in pat.finditer('yxxx')], ['xxx']) + self.assertEqual(pat.findall('yxxx'), ['xxx']) + + raw = 'yxxx' + self.assertEqual([x.group() for x in pat.finditer(raw)], ['xxx']) + self.assertEqual(pat.findall(raw), ['xxx']) + + # Hg issue 106: * operator not working correctly with sub() + if sys.version_info >= (3, 7, 0): + self.assertEqual(regex.sub('(?V0).*', 'x', 'test'), 'xx') + else: + self.assertEqual(regex.sub('(?V0).*', 'x', 'test'), 'x') + self.assertEqual(regex.sub('(?V1).*', 'x', 'test'), 'xx') + + if sys.version_info >= (3, 7, 0): + self.assertEqual(regex.sub('(?V0).*?', '|', 'test'), '|||||||||') + else: + self.assertEqual(regex.sub('(?V0).*?', '|', 'test'), '|t|e|s|t|') + self.assertEqual(regex.sub('(?V1).*?', '|', 'test'), '|||||||||') + + # Hg issue 112: re: OK, but regex: SystemError + self.assertEqual(regex.sub(r'^(@)\n(?!.*?@)(.*)', + r'\1\n==========\n\2', '@\n', flags=regex.DOTALL), '@\n==========\n') + + # Hg issue 109: Edit distance of fuzzy match + self.assertEqual(regex.match(r'(?:cats|cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + self.assertEqual(regex.match(r'(?e)(?:cats|cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + self.assertEqual(regex.match(r'(?b)(?:cats|cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + + self.assertEqual(regex.match(r'(?:cat){e<=1}', 'caz').fuzzy_counts, + (1, 0, 0)) + self.assertEqual(regex.match(r'(?e)(?:cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + self.assertEqual(regex.match(r'(?b)(?:cat){e<=1}', + 'caz').fuzzy_counts, (1, 0, 0)) + + self.assertEqual(regex.match(r'(?:cats){e<=2}', 'c ats').fuzzy_counts, + (1, 1, 0)) + self.assertEqual(regex.match(r'(?e)(?:cats){e<=2}', + 'c ats').fuzzy_counts, (0, 1, 0)) + 
self.assertEqual(regex.match(r'(?b)(?:cats){e<=2}', + 'c ats').fuzzy_counts, (0, 1, 0)) + + self.assertEqual(regex.match(r'(?:cats){e<=2}', + 'c a ts').fuzzy_counts, (0, 2, 0)) + self.assertEqual(regex.match(r'(?e)(?:cats){e<=2}', + 'c a ts').fuzzy_counts, (0, 2, 0)) + self.assertEqual(regex.match(r'(?b)(?:cats){e<=2}', + 'c a ts').fuzzy_counts, (0, 2, 0)) + + self.assertEqual(regex.match(r'(?:cats){e<=1}', 'c ats').fuzzy_counts, + (0, 1, 0)) + self.assertEqual(regex.match(r'(?e)(?:cats){e<=1}', + 'c ats').fuzzy_counts, (0, 1, 0)) + self.assertEqual(regex.match(r'(?b)(?:cats){e<=1}', + 'c ats').fuzzy_counts, (0, 1, 0)) + + # Hg issue 115: Infinite loop when processing backreferences + self.assertEqual(regex.findall(r'\bof ([a-z]+) of \1\b', + 'To make use of one of these modules'), []) + + # Hg issue 125: Reference to entire match (\g<0>) in + # Pattern.sub() doesn't work as of 2014.09.22 release. + self.assertEqual(regex.sub(r'x', r'\g<0>', 'x'), 'x') + + # Unreported issue: no such builtin as 'ascii' in Python 2. + self.assertEqual(bool(regex.match(r'a', 'a', regex.DEBUG)), True) + + # Hg issue 131: nested sets behaviour + self.assertEqual(regex.findall(r'(?V1)[[b-e]--cd]', 'abcdef'), ['b', + 'e']) + self.assertEqual(regex.findall(r'(?V1)[b-e--cd]', 'abcdef'), ['b', + 'e']) + self.assertEqual(regex.findall(r'(?V1)[[bcde]--cd]', 'abcdef'), ['b', + 'e']) + self.assertEqual(regex.findall(r'(?V1)[bcde--cd]', 'abcdef'), ['b', + 'e']) + + # Hg issue 132: index out of range on null property \p{} + self.assertRaisesRegex(regex.error, '^unknown property at position 4$', + lambda: regex.compile(r'\p{}')) + + # Issue 23692. + self.assertEqual(regex.match('(?:()|(?(1)()|z)){2}(?(2)a|z)', + 'a').group(0, 1, 2), ('a', '', '')) + self.assertEqual(regex.match('(?:()|(?(1)()|z)){0,2}(?(2)a|z)', + 'a').group(0, 1, 2), ('a', '', '')) + + # Hg issue 137: Posix character class :punct: does not seem to be + # supported. + + # Posix compatibility as recommended here: + # http://www.unicode.org/reports/tr18/#Compatibility_Properties + + # Posix in Unicode. 
+ chars = ''.join(chr(c) for c in range(0x10000)) + + self.assertEqual(ascii(''.join(regex.findall(r'''[[:alnum:]]+''', + chars))), ascii(''.join(regex.findall(r'''[\p{Alpha}\p{PosixDigit}]+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:alpha:]]+''', + chars))), ascii(''.join(regex.findall(r'''\p{Alpha}+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:ascii:]]+''', + chars))), ascii(''.join(regex.findall(r'''[\p{InBasicLatin}]+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:blank:]]+''', + chars))), ascii(''.join(regex.findall(r'''[\p{gc=Space_Separator}\t]+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:cntrl:]]+''', + chars))), ascii(''.join(regex.findall(r'''\p{gc=Control}+''', chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:digit:]]+''', + chars))), ascii(''.join(regex.findall(r'''[0-9]+''', chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:graph:]]+''', + chars))), ascii(''.join(regex.findall(r'''[^\p{Space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:lower:]]+''', + chars))), ascii(''.join(regex.findall(r'''\p{Lower}+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:print:]]+''', + chars))), ascii(''.join(regex.findall(r'''(?V1)[\p{Graph}\p{Blank}--\p{Cntrl}]+''', chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:punct:]]+''', + chars))), + ascii(''.join(regex.findall(r'''(?V1)[\p{gc=Punctuation}\p{gc=Symbol}--\p{Alpha}]+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:space:]]+''', + chars))), ascii(''.join(regex.findall(r'''\p{Whitespace}+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:upper:]]+''', + chars))), ascii(''.join(regex.findall(r'''\p{Upper}+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:word:]]+''', + chars))), ascii(''.join(regex.findall(r'''[\p{Alpha}\p{gc=Mark}\p{Digit}\p{gc=Connector_Punctuation}\p{Join_Control}]+''', + chars)))) + self.assertEqual(ascii(''.join(regex.findall(r'''[[:xdigit:]]+''', + chars))), ascii(''.join(regex.findall(r'''[0-9A-Fa-f]+''', + chars)))) + + # Posix in ASCII. 
+ chars = bytes(range(0x100)) + + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:alnum:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)[\p{Alpha}\p{PosixDigit}]+''', + chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:alpha:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)\p{Alpha}+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:ascii:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)[\x00-\x7F]+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:blank:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)[\p{gc=Space_Separator}\t]+''', + chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:cntrl:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)\p{gc=Control}+''', + chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:digit:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)[0-9]+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:graph:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)[^\p{Space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:lower:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)\p{Lower}+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:print:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?aV1)[\p{Graph}\p{Blank}--\p{Cntrl}]+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:punct:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?aV1)[\p{gc=Punctuation}\p{gc=Symbol}--\p{Alpha}]+''', + chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:space:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)\p{Whitespace}+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:upper:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)\p{Upper}+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:word:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)[\p{Alpha}\p{gc=Mark}\p{Digit}\p{gc=Connector_Punctuation}\p{Join_Control}]+''', chars)))) + self.assertEqual(ascii(b''.join(regex.findall(br'''(?a)[[:xdigit:]]+''', + chars))), ascii(b''.join(regex.findall(br'''(?a)[0-9A-Fa-f]+''', chars)))) + + # Hg issue 138: grapheme anchored search not working properly. + self.assertEqual(ascii(regex.search(r'\X$', 'ab\u2103').group()), + ascii('\u2103')) + + # Hg issue 139: Regular expression with multiple wildcards where first + # should match empty string does not always work. + self.assertEqual(regex.search("([^L]*)([^R]*R)", "LtR").groups(), ('', + 'LtR')) + + # Hg issue 140: Replace with REVERSE and groups has unexpected + # behavior. + self.assertEqual(regex.sub(r'(.)', r'x\1y', 'ab'), 'xayxby') + self.assertEqual(regex.sub(r'(?r)(.)', r'x\1y', 'ab'), 'xayxby') + self.assertEqual(regex.subf(r'(.)', 'x{1}y', 'ab'), 'xayxby') + self.assertEqual(regex.subf(r'(?r)(.)', 'x{1}y', 'ab'), 'xayxby') + + # Hg issue 141: Crash on a certain partial match. + self.assertEqual(regex.fullmatch('(a)*abc', 'ab', + partial=True).span(), (0, 2)) + self.assertEqual(regex.fullmatch('(a)*abc', 'ab', + partial=True).partial, True) + + # Hg issue 143: Partial matches have incorrect span if prefix is '.' + # wildcard. 
+        self.assertEqual(regex.search('OXRG', 'OOGOX', partial=True).span(),
+            (3, 5))
+        self.assertEqual(regex.search('.XRG', 'OOGOX', partial=True).span(),
+            (3, 5))
+        self.assertEqual(regex.search('.{1,3}XRG', 'OOGOX',
+            partial=True).span(), (1, 5))
+
+        # Hg issue 144: Latest version problem with matching 'R|R'.
+        self.assertEqual(regex.match('R|R', 'R').span(), (0, 1))
+
+        # Hg issue 146: Forced-fail (?!) works improperly in conditional.
+        self.assertEqual(regex.match(r'(.)(?(1)(?!))', 'xy'), None)
+
+        # Groups cleared after failure.
+        self.assertEqual(regex.findall(r'(y)?(\d)(?(1)\b\B)', 'ax1y2z3b'),
+            [('', '1'), ('', '2'), ('', '3')])
+        self.assertEqual(regex.findall(r'(y)?+(\d)(?(1)\b\B)', 'ax1y2z3b'),
+            [('', '1'), ('', '2'), ('', '3')])
+
+        # Hg issue 147: Fuzzy match can return match points beyond buffer end.
+        self.assertEqual([m.span() for m in regex.finditer(r'(?i)(?:error){e}',
+            'regex failure')], [(0, 5), (5, 10), (10, 13), (13, 13)])
+        self.assertEqual([m.span() for m in
+            regex.finditer(r'(?fi)(?:error){e}', 'regex failure')], [(0, 5),
+            (5, 10), (10, 13), (13, 13)])
+
+        # Hg issue 150: Have an option for POSIX-compatible longest match of
+        # alternates.
+        self.assertEqual(regex.search(r'(?p)\d+(\w(\d*)?|[eE]([+-]\d+))',
+            '10b12')[0], '10b12')
+        self.assertEqual(regex.search(r'(?p)\d+(\w(\d*)?|[eE]([+-]\d+))',
+            '10E+12')[0], '10E+12')
+
+        self.assertEqual(regex.search(r'(?p)(\w|ae|oe|ue|ss)', 'ae')[0], 'ae')
+        self.assertEqual(regex.search(r'(?p)one(self)?(selfsufficient)?',
+            'oneselfsufficient')[0], 'oneselfsufficient')
+
+        # Hg issue 151: Request: \K.
+        self.assertEqual(regex.search(r'(ab\Kcd)', 'abcd').group(0, 1), ('cd',
+            'abcd'))
+        self.assertEqual(regex.findall(r'\w\w\K\w\w', 'abcdefgh'), ['cd',
+            'gh'])
+        self.assertEqual(regex.findall(r'(\w\w\K\w\w)', 'abcdefgh'), ['abcd',
+            'efgh'])
+
+        self.assertEqual(regex.search(r'(?r)(ab\Kcd)', 'abcd').group(0, 1),
+            ('ab', 'abcd'))
+        self.assertEqual(regex.findall(r'(?r)\w\w\K\w\w', 'abcdefgh'), ['ef',
+            'ab'])
+        self.assertEqual(regex.findall(r'(?r)(\w\w\K\w\w)', 'abcdefgh'),
+            ['efgh', 'abcd'])
+
+        # Hg issue 152: Request: (?(DEFINE)...).
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<quant>\d+)(?<item>\w+))(?&quant) (?&item)',
+            '5 elephants')[0], '5 elephants')
+
+        self.assertEqual(regex.search(r'(?&routine)(?(DEFINE)(?<routine>.))',
+            'a').group('routine'), None)
+        self.assertEqual(regex.search(r'(?&routine)(?(DEFINE)(?<routine>.))',
+            'a').captures('routine'), ['a'])
+
+        # Hg issue 153: Request: (*SKIP).
+ self.assertEqual(regex.search(r'12(*FAIL)|3', '123')[0], '3') + self.assertEqual(regex.search(r'(?r)12(*FAIL)|3', '123')[0], '3') + + self.assertEqual(regex.search(r'\d+(*PRUNE)\d', '123'), None) + self.assertEqual(regex.search(r'\d+(?=(*PRUNE))\d', '123')[0], '123') + self.assertEqual(regex.search(r'\d+(*PRUNE)bcd|[3d]', '123bcd')[0], + '123bcd') + self.assertEqual(regex.search(r'\d+(*PRUNE)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'\d+?(*PRUNE)bcd|[3d]', '123bcd')[0], + '3bcd') + self.assertEqual(regex.search(r'\d+?(*PRUNE)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'\d++(?<=3(*PRUNE))zzd|[4d]$', + '123zzd')[0], '123zzd') + self.assertEqual(regex.search(r'\d++(?<=3(*PRUNE))zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'\d++(?<=(*PRUNE)3)zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'\d++(?<=2(*PRUNE)3)zzd|[3d]$', + '124zzd')[0], 'd') + + self.assertEqual(regex.search(r'(?r)\d(*PRUNE)\d+', '123'), None) + self.assertEqual(regex.search(r'(?r)\d(?<=(*PRUNE))\d+', '123')[0], + '123') + self.assertEqual(regex.search(r'(?r)\d+(*PRUNE)bcd|[3d]', + '123bcd')[0], '123bcd') + self.assertEqual(regex.search(r'(?r)\d+(*PRUNE)bcd|[3d]', + '123zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=3(*PRUNE))zzd|[4d]$', + '123zzd')[0], '123zzd') + self.assertEqual(regex.search(r'(?r)\d++(?<=3(*PRUNE))zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=(*PRUNE)3)zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=2(*PRUNE)3)zzd|[3d]$', + '124zzd')[0], 'd') + + self.assertEqual(regex.search(r'\d+(*SKIP)bcd|[3d]', '123bcd')[0], + '123bcd') + self.assertEqual(regex.search(r'\d+(*SKIP)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'\d+?(*SKIP)bcd|[3d]', '123bcd')[0], + '3bcd') + self.assertEqual(regex.search(r'\d+?(*SKIP)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'\d++(?<=3(*SKIP))zzd|[4d]$', + '123zzd')[0], '123zzd') + self.assertEqual(regex.search(r'\d++(?<=3(*SKIP))zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'\d++(?<=(*SKIP)3)zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'\d++(?<=2(*SKIP)3)zzd|[3d]$', + '124zzd')[0], 'd') + + self.assertEqual(regex.search(r'(?r)\d+(*SKIP)bcd|[3d]', '123bcd')[0], + '123bcd') + self.assertEqual(regex.search(r'(?r)\d+(*SKIP)bcd|[3d]', '123zzd')[0], + 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=3(*SKIP))zzd|[4d]$', + '123zzd')[0], '123zzd') + self.assertEqual(regex.search(r'(?r)\d++(?<=3(*SKIP))zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=(*SKIP)3)zzd|[4d]$', + '124zzd')[0], 'd') + self.assertEqual(regex.search(r'(?r)\d++(?<=2(*SKIP)3)zzd|[3d]$', + '124zzd')[0], 'd') + + # Hg issue 154: Segmentation fault 11 when working with an atomic group + text = """June 30, December 31, 2013 2012 +some words follow: +more words and numbers 1,234,567 9,876,542 +more words and numbers 1,234,567 9,876,542""" + self.assertEqual(len(regex.findall(r'(?2014|2013 ?2012)', text)), 1) + + # Hg issue 156: regression on atomic grouping + self.assertEqual(regex.match('1(?>2)', '12').span(), (0, 2)) + + # Hg issue 157: regression: segfault on complex lookaround + self.assertEqual(regex.match(r'(?V1w)(?=(?=[^A-Z]*+[A-Z])(?=[^a-z]*+[a-z]))(?=\D*+\d)(?=\p{Alphanumeric}*+\P{Alphanumeric})\A(?s:.){8,255}+\Z', + 'AAaa11!!')[0], 'AAaa11!!') + + # Hg issue 158: Group issue with (?(DEFINE)...) 
+        TEST_REGEX = regex.compile(r'''(?smx)
+(?(DEFINE)
+    (?<subcat>
+    ^,[^,]+,
+    )
+)
+
+# Group 2 is defined on this line
+^,([^,]+),
+
+(?:(?!(?&subcat)[\r\n]+(?&subcat)).)+
+''')
+
+        TEST_DATA = '''
+,Cat 1,
+,Brand 1,
+some
+thing
+,Brand 2,
+other
+things
+,Cat 2,
+,Brand,
+Some
+thing
+'''
+
+        self.assertEqual([m.span(1, 2) for m in
+            TEST_REGEX.finditer(TEST_DATA)], [((-1, -1), (2, 7)), ((-1, -1),
+            (54, 59))])
+
+        # Hg issue 161: Unexpected fuzzy match results
+        self.assertEqual(regex.search('(abcdefgh){e}',
+            '******abcdefghijklmnopqrtuvwxyz', regex.BESTMATCH).span(), (6, 14))
+        self.assertEqual(regex.search('(abcdefghi){e}',
+            '******abcdefghijklmnopqrtuvwxyz', regex.BESTMATCH).span(), (6, 15))
+
+        # Hg issue 163: allow lookarounds in conditionals.
+        self.assertEqual(regex.match(r'(?:(?=\d)\d+\b|\w+)', '123abc').span(),
+            (0, 6))
+        self.assertEqual(regex.match(r'(?(?=\d)\d+\b|\w+)', '123abc'), None)
+        self.assertEqual(regex.search(r'(?(?<=love\s)you|(?<=hate\s)her)',
+            "I love you").span(), (7, 10))
+        self.assertEqual(regex.findall(r'(?(?<=love\s)you|(?<=hate\s)her)',
+            "I love you but I don't hate her either"), ['you', 'her'])
+
+        # Hg issue 180: bug of POSIX matching.
+        self.assertEqual(regex.search(r'(?p)a*(.*?)', 'aaabbb').group(0, 1),
+            ('aaabbb', 'bbb'))
+        self.assertEqual(regex.search(r'(?p)a*(.*)', 'aaabbb').group(0, 1),
+            ('aaabbb', 'bbb'))
+        self.assertEqual(regex.sub(r'(?p)a*(.*?)', r'\1', 'aaabbb'), 'bbb')
+        self.assertEqual(regex.sub(r'(?p)a*(.*)', r'\1', 'aaabbb'), 'bbb')
+
+        # Hg issue 192: Named lists reverse matching doesn't work with
+        # IGNORECASE and V1
+        self.assertEqual(regex.match(r'(?irV0)\L<kw>', '21', kw=['1']).span(),
+            (1, 2))
+        self.assertEqual(regex.match(r'(?irV1)\L<kw>', '21', kw=['1']).span(),
+            (1, 2))
+
+        # Hg issue 193: Alternation and .REVERSE flag.
+        self.assertEqual(regex.search('a|b', '111a222').span(), (3, 4))
+        self.assertEqual(regex.search('(?r)a|b', '111a222').span(), (3, 4))
+
+        # Hg issue 194: .FULLCASE and Backreference
+        self.assertEqual(regex.search(r'(?if)<(CLI)><\1>',
+            '<cli><cli>').span(), (0, 10))
+        self.assertEqual(regex.search(r'(?if)<(CLI)><\1>',
+            '<cli><cli>').span(), (0, 10))
+        self.assertEqual(regex.search(r'(?ifr)<\1><(CLI)>',
+            '<cli><cli>').span(), (0, 10))
+
+        # Hg issue 195: Pickle (or otherwise serial) the compiled regex
+        r = regex.compile(r'\L<options>', options=['foo', 'bar'])
+        p = pickle.dumps(r)
+        r = pickle.loads(p)
+        self.assertEqual(r.match('foo').span(), (0, 3))
+
+        # Hg issue 196: Fuzzy matching on repeated regex not working as
+        # expected
+        self.assertEqual(regex.match('(x{6}){e<=1}', 'xxxxxx',
+            flags=regex.BESTMATCH).span(), (0, 6))
+        self.assertEqual(regex.match('(x{6}){e<=1}', 'xxxxx',
+            flags=regex.BESTMATCH).span(), (0, 5))
+        self.assertEqual(regex.match('(x{6}){e<=1}', 'x',
+            flags=regex.BESTMATCH), None)
+        self.assertEqual(regex.match('(?r)(x{6}){e<=1}', 'xxxxxx',
+            flags=regex.BESTMATCH).span(), (0, 6))
+        self.assertEqual(regex.match('(?r)(x{6}){e<=1}', 'xxxxx',
+            flags=regex.BESTMATCH).span(), (0, 5))
+        self.assertEqual(regex.match('(?r)(x{6}){e<=1}', 'x',
+            flags=regex.BESTMATCH), None)
+
+        # Hg issue 197: ValueError in regex.compile
+        self.assertRaises(regex.error, lambda:
+            regex.compile(b'00000\\0\\00\\^\50\\00\\U05000000'))
+
+        # Hg issue 198: ValueError in regex.compile
+        self.assertRaises(regex.error, lambda: regex.compile(b"{e', '22', aa=['121',
+            '22'])), True)
+        self.assertEqual(bool(regex.search(r'(?ri)\L<aa>', '22', aa=['121',
+            '22'])), True)
+        self.assertEqual(bool(regex.search(r'(?fi)\L<aa>', '22', aa=['121',
+            '22'])), True)
+        self.assertEqual(bool(regex.search(r'(?fri)\L<aa>', '22', aa=['121',
+            '22'])), True)
+
+        # Hg issue 208: Named list, (?ri) flags, Backreference
+        self.assertEqual(regex.search(r'(?r)\1dog..(?<=(\L<aa>))$', 'ccdogcc',
+            aa=['bcb', 'cc']).span(), (0, 7))
+        self.assertEqual(regex.search(r'(?ir)\1dog..(?<=(\L<aa>))$',
+            'ccdogcc', aa=['bcb', 'cc']).span(), (0, 7))
+
+        # Hg issue 210: Fuzzy matching and Backreference
+        self.assertEqual(regex.search(r'(2)(?:\1{5}){e<=1}',
+            '3222212').span(), (1, 7))
+        self.assertEqual(regex.search(r'(\d)(?:\1{5}){e<=1}',
+            '3222212').span(), (1, 7))
+
+        # Hg issue 211: Segmentation fault with recursive matches and atomic
+        # groups
+        self.assertEqual(regex.match(r'''\A(?P<whole>(?>\((?&whole)\)|[+\-]))\Z''',
+            '((-))').span(), (0, 5))
+        self.assertEqual(regex.match(r'''\A(?P<whole>(?>\((?&whole)\)|[+\-]))\Z''',
+            '((-)+)'), None)
+
+        # Hg issue 212: Unexpected matching difference with .*? between re and
+        # regex
+        self.assertEqual(regex.match(r"x.*? (.).*\1(.*)\1",
+            'x |y| z|').span(), (0, 9))
+        self.assertEqual(regex.match(r"\.sr (.*?) 
(.)(.*)\2(.*)\2(.*)", + r'.sr h |||').span(), (0, 35)) + + # Hg issue 213: Segmentation Fault + a = '"\\xF9\\x80\\xAEqdz\\x95L\\xA7\\x89[\\xFE \\x91)\\xF9]\\xDB\'\\x99\\x09=\\x00\\xFD\\x98\\x22\\xDD\\xF1\\xB6\\xC3 Z\\xB6gv\\xA5x\\x93P\\xE1r\\x14\\x8Cv\\x0C\\xC0w\\x15r\\xFFc%" ' + py_regex_pattern = r'''(?P((?>(?"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``)))) (?P((?>(?"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``))))''' + self.assertEqual(bool(regex.search(py_regex_pattern, a)), False) + + # Hg Issue 216: Invalid match when using negative lookbehind and pipe + self.assertEqual(bool(regex.match('foo(?<=foo)', 'foo')), True) + self.assertEqual(bool(regex.match('foo(?.*\!\w*\:.*)|(?P.*))', + '!')), False) + + # Hg issue 220: Misbehavior of group capture with OR operand + self.assertEqual(regex.match(r'\w*(ea)\w*|\w*e(?!a)\w*', + 'easier').groups(), ('ea', )) + + # Hg issue 225: BESTMATCH in fuzzy match not working + self.assertEqual(regex.search('(^1234$){i,d}', '12234', + regex.BESTMATCH).span(), (0, 5)) + self.assertEqual(regex.search('(^1234$){i,d}', '12234', + regex.BESTMATCH).fuzzy_counts, (0, 1, 0)) + + self.assertEqual(regex.search('(^1234$){s,i,d}', '12234', + regex.BESTMATCH).span(), (0, 5)) + self.assertEqual(regex.search('(^1234$){s,i,d}', '12234', + regex.BESTMATCH).fuzzy_counts, (0, 1, 0)) + + # Hg issue 226: Error matching at start of string + self.assertEqual(regex.search('(^123$){s,i,d}', 'xxxxxxxx123', + regex.BESTMATCH).span(), (0, 11)) + self.assertEqual(regex.search('(^123$){s,i,d}', 'xxxxxxxx123', + regex.BESTMATCH).fuzzy_counts, (0, 8, 0)) + + # Hg issue 227: Incorrect behavior for ? operator with UNICODE + + # IGNORECASE + self.assertEqual(regex.search(r'a?yz', 'xxxxyz', flags=regex.FULLCASE | + regex.IGNORECASE).span(), (4, 6)) + + # Hg issue 230: Is it a bug of (?(DEFINE)...) + self.assertEqual(regex.findall(r'(?:(?![a-d]).)+', 'abcdefgh'), + ['efgh']) + self.assertEqual(regex.findall(r'''(?(DEFINE)(?P(?:(?![a-d]).)))(?&mydef)+''', + 'abcdefgh'), ['efgh']) + + # Hg issue 238: Not fully re backward compatible + self.assertEqual(regex.findall(r'((\w{1,3})(\.{2,10})){1,3}', + '"Erm....yes. T..T...Thank you for that."'), [('Erm....', 'Erm', + '....'), ('T...', 'T', '...')]) + self.assertEqual(regex.findall(r'((\w{1,3})(\.{2,10})){3}', + '"Erm....yes. T..T...Thank you for that."'), []) + self.assertEqual(regex.findall(r'((\w{1,3})(\.{2,10})){2}', + '"Erm....yes. T..T...Thank you for that."'), [('T...', 'T', '...')]) + self.assertEqual(regex.findall(r'((\w{1,3})(\.{2,10})){1}', + '"Erm....yes. 
+
+        # Hg issue 247: Unexpected result with fuzzy matching and lookahead
+        # expression
+        self.assertEqual(regex.search(r'(?:ESTONIA(?!\w)){e<=1}',
+          'ESTONIAN WORKERS').group(), 'ESTONIAN')
+        self.assertEqual(regex.search(r'(?:ESTONIA(?=\W)){e<=1}',
+          'ESTONIAN WORKERS').group(), 'ESTONIAN')
+
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.))(?&func)',
+          'abc').groups(), (None, ))
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.))(?&func)',
+          'abc').groupdict(), {'func': None})
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.))(?&func)',
+          'abc').capturesdict(), {'func': ['a']})
+
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.))(?=(?&func))',
+          'abc').groups(), (None, ))
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.))(?=(?&func))',
+          'abc').groupdict(), {'func': None})
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.))(?=(?&func))',
+          'abc').capturesdict(), {'func': ['a']})
+
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.)).(?<=(?&func))',
+          'abc').groups(), (None, ))
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.)).(?<=(?&func))',
+          'abc').groupdict(), {'func': None})
+        self.assertEqual(regex.search(r'(?(DEFINE)(?<func>.)).(?<=(?&func))',
+          'abc').capturesdict(), {'func': ['a']})
+
+        # Hg issue 271: Comment logic different between Re and Regex
+        self.assertEqual(bool(regex.match(r'ab(?#comment\))cd', 'abcd')), True)
+
+        # Hg issue 276: Partial Matches yield incorrect matches and bounds
+        self.assertEqual(regex.search(r'[a-z]+ [a-z]*?:', 'foo bar',
+          partial=True).span(), (0, 7))
+        self.assertEqual(regex.search(r'(?r):[a-z]*? [a-z]+', 'foo bar',
+          partial=True).span(), (0, 7))
+
+        # Hg issue 291: Include Script Extensions as a supported Unicode property
+        self.assertEqual(bool(regex.match(r'(?u)\p{Script:Beng}',
+          '\u09EF')), True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{Script:Bengali}',
+          '\u09EF')), True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{Script_Extensions:Bengali}',
+          '\u09EF')), True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{Script_Extensions:Beng}',
+          '\u09EF')), True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{Script_Extensions:Cakm}',
+          '\u09EF')), True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{Script_Extensions:Sylo}',
+          '\u09EF')), True)
+
+        # Hg issue #293: scx (Script Extensions) property currently matches
+        # incorrectly
+        self.assertEqual(bool(regex.match(r'(?u)\p{scx:Latin}', 'P')), True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{scx:Ahom}', 'P')), False)
+        self.assertEqual(bool(regex.match(r'(?u)\p{scx:Common}', '4')), True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{scx:Caucasian_Albanian}', '4')),
+          False)
+        self.assertEqual(bool(regex.match(r'(?u)\p{scx:Arabic}', '\u062A')), True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{scx:Balinese}', '\u062A')),
+          False)
+        self.assertEqual(bool(regex.match(r'(?u)\p{scx:Devanagari}', '\u091C')),
+          True)
+        self.assertEqual(bool(regex.match(r'(?u)\p{scx:Batak}', '\u091C')), False)
+
+        # Hg issue 296: Group references are not taken into account when group is reporting the last match
+        self.assertEqual(regex.fullmatch('(?P<x>.)*(?&x)', 'abc').captures('x'),
+          ['a', 'b', 'c'])
+        self.assertEqual(regex.fullmatch('(?P<x>.)*(?&x)', 'abc').group('x'),
+          'b')
+
+        self.assertEqual(regex.fullmatch('(?P<x>.)(?P<x>.)(?P<x>.)',
+          'abc').captures('x'), ['a', 'b', 'c'])
+        self.assertEqual(regex.fullmatch('(?P<x>.)(?P<x>.)(?P<x>.)',
+          'abc').group('x'), 'c')
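+
+        # For context: the regex module records every capture of a group;
+        # Match.captures(name) returns all of them, while Match.group(name)
+        # returns only the most recent capture.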
+
+        # Hg issue 299: Partial gives misleading results with "open ended" regexp
+        self.assertEqual(regex.match('(?:ab)*', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)*', 'abab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)*?', '', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)*+', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)*+', 'abab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)+', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)+', 'abab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)+?', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)++', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?:ab)++', 'abab', partial=True).partial,
+          False)
+
+        self.assertEqual(regex.match('(?r)(?:ab)*', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)*', 'abab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)*?', '', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)*+', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)*+', 'abab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)+', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)+', 'abab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)+?', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)++', 'ab', partial=True).partial,
+          False)
+        self.assertEqual(regex.match('(?r)(?:ab)++', 'abab', partial=True).partial,
+          False)
+
+        self.assertEqual(regex.match('a*', '', partial=True).partial, False)
+        self.assertEqual(regex.match('a*?', '', partial=True).partial, False)
+        self.assertEqual(regex.match('a*+', '', partial=True).partial, False)
+        self.assertEqual(regex.match('a+', '', partial=True).partial, True)
+        self.assertEqual(regex.match('a+?', '', partial=True).partial, True)
+        self.assertEqual(regex.match('a++', '', partial=True).partial, True)
+        self.assertEqual(regex.match('a+', 'a', partial=True).partial, False)
+        self.assertEqual(regex.match('a+?', 'a', partial=True).partial, False)
+        self.assertEqual(regex.match('a++', 'a', partial=True).partial, False)
+
+        self.assertEqual(regex.match('(?r)a*', '', partial=True).partial, False)
+        self.assertEqual(regex.match('(?r)a*?', '', partial=True).partial, False)
+        self.assertEqual(regex.match('(?r)a*+', '', partial=True).partial, False)
+        self.assertEqual(regex.match('(?r)a+', '', partial=True).partial, True)
+        self.assertEqual(regex.match('(?r)a+?', '', partial=True).partial, True)
+        self.assertEqual(regex.match('(?r)a++', '', partial=True).partial, True)
+        self.assertEqual(regex.match('(?r)a+', 'a', partial=True).partial, False)
+        self.assertEqual(regex.match('(?r)a+?', 'a', partial=True).partial, False)
+        self.assertEqual(regex.match('(?r)a++', 'a', partial=True).partial, False)
+
+        self.assertEqual(regex.match(r"(?:\s*\w+'*)+", 'whatever', partial=True).partial,
+          False)
+
+        # Hg issue 300: segmentation fault
+        pattern = ('(?P<upstream>GGCGTCACACTTTGCTATGCCATAGCAT[AG]TTTATCCATAAGA'
+          'TTAGCGGATCCTACCTGACGCTTTTTATCGCAACTCTCTACTGTTTCTCCATAACAGAACATATTGA'
+          'CTATCCGGTATTACCCGGCATGACAGGAGTAAAA){e<=1}'
+          '(?P<insert>[ACGT]{1059}){e<=2}'
+          '(?P<downstream>TAATCGTCTTGTTTGATACACAAGGGTCGCATCTGCGGCCCTTTTGCTTTTTTAAG'
+          'TTGTAAGGATATGCCATTCTAGA){e<=0}'
+          '(?P<barcode>[ACGT]{18}){e<=0}'
+          '(?P<adapter>AGATCGG[CT]AGAGCGTCGTGTAGGGAAAGAGTGTGG){e<=1}')
+
+        text = ('GCACGGCGTCACACTTTGCTATGCCATAGCATATTTATCCATAAGATTAGCGGATCCTACC'
+          'TGACGCTTTTTATCGCAACTCTCTACTGTTTCTCCATAACAGAACATATTGACTATCCGGTATTACC'
+          'CGGCATGACAGGAGTAAAAATGGCTATCGACGAAAACAAACAGAAAGCGTTGGCGGCAGCACTGGGC'
+          'CAGATTGAGAAACAATTTGGTAAAGGCTCCATCATGCGCCTGGGTGAAGACCGTTCCATGGATGTGG'
+          'AAACCATCTCTACCGGTTCGCTTTCACTGGATATCGCGCTTGGGGCAGGTGGTCTGCCGATGGGCCG'
+          'TATCGTCGAAATCTACGGACCGGAATCTTCCGGTAAAACCACGCTGACGCTGCAGGTGATCGCCGCA'
+          'GCGCAGCGTGAAGGTAAAACCTGTGCGTTTATCGATGCTGAACACGCGCTGGACCCAATCTACGCAC'
+          'GTAAACTGGGCGTCGATATCGACAACCTGCTGTGCTCCCAGCCGGACACCGGCGAGCAGGCACTGGA'
+          'AATCTGTGACGCCCTGGCGCGTTCTGGCGCAGTAGACGTTATCGTCGTTGACTCCGTGGCGGCACTG'
+          'ACGCCGAAAGCGGAAATCGAAGGCGAAATCGGCGACTCTCATATGGGCCTTGCGGCACGTATGATGA'
+          'GCCAGGCGATGCGTAAGCTGGCGGGTAACCTGAAGCAGTCCAACACGCTGCTGATCTTCATCAACCC'
+          'CATCCGTATGAAAATTGGTGTGATGTTCGGCAACCCGGAAACCACTTACCGGTGGTAACGCGCTGAA'
+          'ATTCTACGCCTCTGTTCGTCTCGACATCCGTTAAATCGGCGCGGTGAAAGAGGGCGAAAACGTGGTG'
+          'GGTAGCGAAACCCGCGTGAAAGTGGTGAAGAACAAAATCGCTGCGCCGTTTAAACAGGCTGAATTCC'
+          'AGATCCTCTACGGCGAAGGTATCAACTTCTACCCCGAACTGGTTGACCTGGGCGTAAAAGAGAAGCT'
+          'GATCGAGAAAGCAGGCGCGTGGTACAGCTACAAAGGTGAGAAGATCGGTCAGGGTAAAGCGAATGCG'
+          'ACTGCCTGGCTGAAATTTAACCCGGAAACCGCGAAAGAGATCGAGTGAAAAGTACGTGAGTTGCTGC'
+          'TGAGCAACCCGAACTCAACGCCGGATTTCTCTGTAGATGATAGCGAAGGCGTAGCAGAAACTAACGA'
+          'AGATTTTTAATCGTCTTGTTTGATACACAAGGGTCGCATCTGCGGCCCTTTTGCTTTTTTAAGTTGT'
+          'AAGGATATGCCATTCTAGACAGTTAACACACCAACAAAGATCGGTAGAGCGTCGTGTAGGGAAAGAG'
+          'TGTGGTACC')
+
+        m = regex.search(pattern, text, flags=regex.BESTMATCH)
+        self.assertEqual(m.fuzzy_counts, (0, 1, 0))
+        self.assertEqual(m.fuzzy_changes, ([], [1206], []))
+
+        # Hg issue 306: Fuzzy match parameters not respecting quantifier scope
+        self.assertEqual(regex.search(r'(?e)(dogf(((oo){e<1})|((00){e<1}))d){e<2}',
+          'dogfood').fuzzy_counts, (0, 0, 0))
+        self.assertEqual(regex.search(r'(?e)(dogf(((oo){e<1})|((00){e<1}))d){e<2}',
+          'dogfoot').fuzzy_counts, (1, 0, 0))
+
+        # Hg issue 312: \X not matching graphemes with zero-width-joins
+        self.assertEqual(regex.findall(r'\X',
+          '\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466'),
+          ['\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466'])
+
+        # Hg issue 320: Abnormal performance
+        self.assertEqual(bool(regex.search(r'(?=a)a', 'a')), True)
+        self.assertEqual(bool(regex.search(r'(?!b)a', 'a')), True)
+
+        # Hg issue 327: .fullmatch() causes MemoryError
+        self.assertEqual(regex.fullmatch(r'((\d)*?)*?', '123').span(), (0, 3))
+
+        # Hg issue 329: Wrong group matches when question mark quantifier is used within a look behind
+        self.assertEqual(regex.search(r'''(?(DEFINE)(?<mydef>(?<wrong>THIS_SHOULD_NOT_MATCHx?)|(?<right>right))).*(?<=(?&mydef).*)''',
+          'x right').capturesdict(), {'mydef': ['right'], 'wrong': [], 'right':
+          ['right']})
+
+        # Hg issue 338: specifying allowed characters when fuzzy-matching
+        self.assertEqual(bool(regex.match(r'(?:cat){e<=1:[u]}', 'cut')), True)
+        self.assertEqual(bool(regex.match(r'(?:cat){e<=1:u}', 'cut')), True)
+
+        # Hg issue 353: fuzzy changes negative indexes
+        self.assertEqual(regex.search(r'(?be)(AGTGTTCCCCGCGCCAGCGGGGATAAACCG){s<=5,i<=5,d<=5,s+i+d<=10}',
+          'TTCCCCGCGCCAGCGGGGATAAACCG').fuzzy_changes, ([], [], [0, 1, 3, 5]))
+
+        # Git issue 364: Contradictory values in fuzzy_counts and fuzzy_changes
+        self.assertEqual(regex.match(r'(?:bc){e}', 'c').fuzzy_counts, (1, 0,
+          1))
+        self.assertEqual(regex.match(r'(?:bc){e}', 'c').fuzzy_changes, ([0],
+          [], [1]))
+        self.assertEqual(regex.match(r'(?e)(?:bc){e}', 'c').fuzzy_counts, (0,
+          0, 1))
+        self.assertEqual(regex.match(r'(?e)(?:bc){e}', 'c').fuzzy_changes,
+          ([], [], [0]))
+        self.assertEqual(regex.match(r'(?b)(?:bc){e}', 'c').fuzzy_counts, (0,
+          0, 1))
+        self.assertEqual(regex.match(r'(?b)(?:bc){e}', 'c').fuzzy_changes,
+          ([], [], [0]))
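+
+        # For context: fuzzy_counts is a (substitutions, insertions,
+        # deletions) tuple and fuzzy_changes lists the positions of those
+        # edits; the inline flag (?b) requests BESTMATCH and (?e) requests
+        # ENHANCEMATCH.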
+
+        # Git issue 370: Confusions about Fuzzy matching behavior
+        self.assertEqual(regex.match('(?e)(?:^(\\$ )?\\d{1,3}(,\\d{3})*(\\.\\d{2})$){e}',
+          '$ 10,112.111.12').fuzzy_counts, (6, 0, 5))
+        self.assertEqual(regex.match('(?e)(?:^(\\$ )?\\d{1,3}(,\\d{3})*(\\.\\d{2})$){s<=1}',
+          '$ 10,112.111.12').fuzzy_counts, (1, 0, 0))
+        self.assertEqual(regex.match('(?e)(?:^(\\$ )?\\d{1,3}(,\\d{3})*(\\.\\d{2})$){s<=1,i<=1,d<=1}',
+          '$ 10,112.111.12').fuzzy_counts, (1, 0, 0))
+        self.assertEqual(regex.match('(?e)(?:^(\\$ )?\\d{1,3}(,\\d{3})*(\\.\\d{2})$){s<=3}',
+          '$ 10,1a2.111.12').fuzzy_counts, (2, 0, 0))
+        self.assertEqual(regex.match('(?e)(?:^(\\$ )?\\d{1,3}(,\\d{3})*(\\.\\d{2})$){s<=2}',
+          '$ 10,1a2.111.12').fuzzy_counts, (2, 0, 0))
+
+        self.assertEqual(regex.fullmatch(r'(?e)(?:0?,0(?:,0)?){s<=1,d<=1}',
+          ',0;0').fuzzy_counts, (1, 0, 0))
+        self.assertEqual(regex.fullmatch(r'(?e)(?:0??,0(?:,0)?){s<=1,d<=1}',
+          ',0;0').fuzzy_counts, (1, 0, 0))
+
+        # Git issue 371: Specifying character set when fuzzy-matching allows characters not in the set
+        self.assertEqual(regex.search(r"\b(?e)(?:\d{6,20}){i<=5:[\-\\\/]}\b",
+          "cat dog starting at 00:01132.000. hello world"), None)
+
+        # Git issue 385: Comments in expressions
+        self.assertEqual(bool(regex.compile('(?#)')), True)
+        self.assertEqual(bool(regex.compile('(?x)(?#)')), True)
+
+        # Git issue 394: Unexpected behaviour in fuzzy matching with limited character set with IGNORECASE flag
+        self.assertEqual(regex.findall(r'(\d+){i<=2:[ab]}', '123X4Y5'),
+          ['123', '4', '5'])
+        self.assertEqual(regex.findall(r'(?i)(\d+){i<=2:[ab]}', '123X4Y5'),
+          ['123', '4', '5'])
+
+        # Git issue 403: Fuzzy matching with wrong distance (unnecessary substitutions)
+        self.assertEqual(regex.match(r'^(test){e<=5}$', 'terstin',
+          flags=regex.B).fuzzy_counts, (0, 3, 0))
+
+        # Git issue 408: regex fails with a quantified backreference but succeeds with repeated backref
+        self.assertEqual(bool(regex.match(r"(?:(x*)\1\1\1)*x$", "x" * 5)), True)
+        self.assertEqual(bool(regex.match(r"(?:(x*)\1{3})*x$", "x" * 5)), True)
+
+        # Git issue 415: Fuzzy character restrictions don't apply to insertions at "right edge"
+        self.assertEqual(regex.match(r't(?:es){s<=1:\d}t', 'te5t').group(),
+          'te5t')
+        self.assertEqual(regex.match(r't(?:es){s<=1:\d}t', 'tezt'), None)
+        self.assertEqual(regex.match(r't(?:es){i<=1:\d}t', 'tes5t').group(),
+          'tes5t')
+        self.assertEqual(regex.match(r't(?:es){i<=1:\d}t', 'teszt'), None)
+        self.assertEqual(regex.match(r't(?:es){i<=1:\d}t',
+          'tes5t').fuzzy_changes, ([], [3], []))
+        self.assertEqual(regex.match(r't(es){i<=1,0<e<=1:\d}t',
+          'tes5t').fuzzy_changes, ([], [3], []))
+
+        # Git issue 427: Possible bug with BESTMATCH
+        sequence = 'TTCAGACGTGTGCTCTTCCGATCTCAATACCGACTCCTCACTGTGTGTCT'
+        pattern = r'(?P<insert>.*)(?P<anchor>CTTCC){e<=1}(?P<umi>([ACGT]){4,6})(?P<sid>CAATACCGACTCCTCACTGTGT){e<=2}(?P<end>([ACGT]){0,6}$)'
+
+        m = regex.match(pattern, sequence, flags=regex.BESTMATCH)
+        self.assertEqual(m.span(), (0, 50))
+        self.assertEqual(m.groupdict(), {'insert': 'TTCAGACGTGTGCT', 'anchor': 'CTTCC', 'umi': 'GATCT', 'sid': 'CAATACCGACTCCTCACTGTGT', 'end': 'GTCT'})
+
+        m = regex.match(pattern, sequence, flags=regex.ENHANCEMATCH)
+        self.assertEqual(m.span(), (0, 50))
+        self.assertEqual(m.groupdict(), {'insert': 'TTCAGACGTGTGCT', 'anchor': 'CTTCC', 'umi': 'GATCT', 'sid': 'CAATACCGACTCCTCACTGTGT', 'end': 'GTCT'})
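+
+        # For context: the BESTMATCH flag makes fuzzy matching search for the
+        # best possible match rather than the first acceptable one, while
+        # ENHANCEMATCH attempts to improve the fit of a fuzzy match once one
+        # has been found.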
+
+        # Git issue 433: Disagreement between fuzzy_counts and fuzzy_changes
+        pattern = r'(?P<insert>.*)(?P<anchor>AACACTGG){e<=1}(?P<umi>([AT][CG]){5}){e<=2}(?P<sid>GTAACCGAAG){e<=2}(?P<end>([ACGT]){0,6}$)'
+
+        sequence = 'GGAAAACACTGGTCTCAGTCTCGTAACCGAAGTGGTCG'
+        m = regex.match(pattern, sequence, flags=regex.BESTMATCH)
+        self.assertEqual(m.fuzzy_counts, (0, 0, 0))
+        self.assertEqual(m.fuzzy_changes, ([], [], []))
+
+        sequence = 'GGAAAACACTGGTCTCAGTCTCGTCCCCGAAGTGGTCG'
+        m = regex.match(pattern, sequence, flags=regex.BESTMATCH)
+        self.assertEqual(m.fuzzy_counts, (2, 0, 0))
+        self.assertEqual(m.fuzzy_changes, ([24, 25], [], []))
+
+        # Git issue 439: Unmatched groups: sub vs subf
+        self.assertEqual(regex.sub(r'(test1)|(test2)', r'matched: \1\2', 'test1'), 'matched: test1')
+        self.assertEqual(regex.subf(r'(test1)|(test2)', r'matched: {1}{2}', 'test1'), 'matched: test1')
+        self.assertEqual(regex.search(r'(test1)|(test2)', 'matched: test1').expand(r'matched: \1\2'), 'matched: test1')
+        self.assertEqual(regex.search(r'(test1)|(test2)', 'matched: test1').expandf(r'matched: {1}{2}'), 'matched: test1')
+
+        # Git issue 442: Fuzzy regex matching doesn't seem to test insertions correctly
+        self.assertEqual(regex.search(r"(?:\bha\b){i:[ ]}", "having"), None)
+        self.assertEqual(regex.search(r"(?:\bha\b){i:[ ]}", "having", flags=regex.I), None)
+
+        # Git issue 467: Scoped inline flags 'a', 'u' and 'L' affect global flags
+        self.assertEqual(regex.match(r'(?a:\w)\w', 'd\N{CYRILLIC SMALL LETTER ZHE}').span(), (0, 2))
+        self.assertEqual(regex.match(r'(?a:\w)(?u:\w)', 'd\N{CYRILLIC SMALL LETTER ZHE}').span(), (0, 2))
+
+        # Git issue 473: Emoji classified as letter
+        self.assertEqual(regex.match(r'^\p{LC}+$', '\N{SMILING CAT FACE WITH OPEN MOUTH}'), None)
+        self.assertEqual(regex.match(r'^\p{So}+$', '\N{SMILING CAT FACE WITH OPEN MOUTH}').span(), (0, 1))
+
+        # Git issue 474: regex has no equivalent to `re.Match.groups()` for captures
+        self.assertEqual(regex.match(r'(.)+', 'abc').allcaptures(), (['abc'], ['a', 'b', 'c']))
+        self.assertEqual(regex.match(r'(.)+', 'abc').allspans(), ([(0, 3)], [(0, 1), (1, 2), (2, 3)]))
+
+        # Git issue 477: \v for vertical spacing
+        self.assertEqual(bool(regex.fullmatch(r'\p{HorizSpace}+', '\t \xA0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F\u205F\u3000')), True)
+        self.assertEqual(bool(regex.fullmatch(r'\p{VertSpace}+', '\n\v\f\r\x85\u2028\u2029')), True)
+
+        # Git issue 479: Segmentation fault when using conditional pattern
+        self.assertEqual(regex.match(r'(?(?<=A)|(?(?![^B])C|D))', 'A'), None)
+        self.assertEqual(regex.search(r'(?(?<=A)|(?(?![^B])C|D))', 'A').span(), (1, 1))
+
+        # Git issue 494: Backtracking failure matching regex ^a?(a?)b?c\1$ against string abca
+        self.assertEqual(regex.search(r"^a?(a?)b?c\1$", "abca").span(), (0, 4))
+
+        # Git issue 498: Conditional negative lookahead inside positive lookahead fails to match
+        self.assertEqual(regex.match(r'(?(?=a).|..)', 'ab').span(), (0, 1))
+        self.assertEqual(regex.match(r'(?(?=b).|..)', 'ab').span(), (0, 2))
+        self.assertEqual(regex.match(r'(?(?!a).|..)', 'ab').span(), (0, 2))
+        self.assertEqual(regex.match(r'(?(?!b).|..)', 'ab').span(), (0, 1))
+
+        # Git issue 525: segfault when fuzzy matching empty list
+        self.assertEqual(regex.match(r"(\L<foo>){e<=5}", "blah", foo=[]).span(), (0, 0))
+
+        # Git issue 527: `VERBOSE`/`X` flag breaks `\N` escapes
+        self.assertEqual(regex.compile(r'\N{LATIN SMALL LETTER A}').match('a').span(), (0, 1))
+        self.assertEqual(regex.compile(r'\N{LATIN SMALL LETTER A}', flags=regex.X).match('a').span(), (0, 1))
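+
+        # For context: passing partial=True lets a match succeed at the end
+        # of the text even when more input could still complete it;
+        # Match.partial then reports whether the match was only partial.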
+
+        # Git issue 539: Bug: Partial matching fails on a simple example
+        self.assertEqual(regex.match(r"[^/]*b/ccc", "b/ccc", partial=True).span(), (0, 5))
+        self.assertEqual(regex.match(r"[^/]*b/ccc", "b/ccb", partial=True), None)
+        self.assertEqual(regex.match(r"[^/]*b/ccc", "b/cc", partial=True).span(), (0, 4))
+        self.assertEqual(regex.match(r"[^/]*b/xyz", "b/xy", partial=True).span(), (0, 4))
+        self.assertEqual(regex.match(r"[^/]*b/xyz", "b/yz", partial=True), None)
+
+        self.assertEqual(regex.match(r"(?i)[^/]*b/ccc", "b/ccc", partial=True).span(), (0, 5))
+        self.assertEqual(regex.match(r"(?i)[^/]*b/ccc", "b/ccb", partial=True), None)
+        self.assertEqual(regex.match(r"(?i)[^/]*b/ccc", "b/cc", partial=True).span(), (0, 4))
+        self.assertEqual(regex.match(r"(?i)[^/]*b/xyz", "b/xy", partial=True).span(), (0, 4))
+        self.assertEqual(regex.match(r"(?i)[^/]*b/xyz", "b/yz", partial=True), None)
+
+        # Git issue 546: Partial match not working in some instances with non-greedy capture
+        self.assertEqual(bool(regex.match(r'<name>.*?</name>', '<', partial=True)), True)
+        self.assertEqual(bool(regex.match(r'<name>.*?</name>', '<name', partial=True)), True)
+        self.assertEqual(bool(regex.match(r'<name>.*?</name>', '<name>', partial=True)), True)
+        self.assertEqual(bool(regex.match(r'<name>.*?</name>', '<name>x', partial=True)), True)
+        self.assertEqual(bool(regex.match(r'<name>.*?</name>', '<name>xyz abc', partial=True)), True)
+        self.assertEqual(bool(regex.match(r'<name>.*?</name>', '<name>xyz abc foo', partial=True)), True)
+        self.assertEqual(bool(regex.match(r'<name>.*?</name>', '<name>xyz abc foo ', partial=True)), True)
+        self.assertEqual(bool(regex.match(r'<name>.*?</name>', '<name>xyz abc foo bar', partial=True)), True)
+
+    def test_fuzzy_ext(self):
+        self.assertEqual(bool(regex.fullmatch(r'(?r)(?:a){e<=1:[a-z]}', 'e')),
+          True)
+        self.assertEqual(bool(regex.fullmatch(r'(?:a){e<=1:[a-z]}', 'e')),
+          True)
+        self.assertEqual(bool(regex.fullmatch(r'(?:a){e<=1:[a-z]}', '-')),
+          False)
+        self.assertEqual(bool(regex.fullmatch(r'(?r)(?:a){e<=1:[a-z]}', '-')),
+          False)
+
+        self.assertEqual(bool(regex.fullmatch(r'(?:a){e<=1:[a-z]}', 'ae')),
+          True)
+        self.assertEqual(bool(regex.fullmatch(r'(?r)(?:a){e<=1:[a-z]}',
+          'ae')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?:a){e<=1:[a-z]}', 'a-')),
+          False)
+        self.assertEqual(bool(regex.fullmatch(r'(?r)(?:a){e<=1:[a-z]}',
+          'a-')), False)
+
+        self.assertEqual(bool(regex.fullmatch(r'(?:ab){e<=1:[a-z]}', 'ae')),
+          True)
+        self.assertEqual(bool(regex.fullmatch(r'(?r)(?:ab){e<=1:[a-z]}',
+          'ae')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?:ab){e<=1:[a-z]}', 'a-')),
+          False)
+        self.assertEqual(bool(regex.fullmatch(r'(?r)(?:ab){e<=1:[a-z]}',
+          'a-')), False)
+
+        self.assertEqual(bool(regex.fullmatch(r'(a)\1{e<=1:[a-z]}', 'ae')),
+          True)
+        self.assertEqual(bool(regex.fullmatch(r'(?r)\1{e<=1:[a-z]}(a)',
+          'ea')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(a)\1{e<=1:[a-z]}', 'a-')),
+          False)
+        self.assertEqual(bool(regex.fullmatch(r'(?r)\1{e<=1:[a-z]}(a)',
+          '-a')), False)
+
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(?:\N{LATIN SMALL LETTER SHARP S}){e<=1:[a-z]}',
+          'ts')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(?:\N{LATIN SMALL LETTER SHARP S}){e<=1:[a-z]}',
+          'st')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)(?:\N{LATIN SMALL LETTER SHARP S}){e<=1:[a-z]}',
+          'st')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)(?:\N{LATIN SMALL LETTER SHARP S}){e<=1:[a-z]}',
+          'ts')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(?:\N{LATIN SMALL LETTER SHARP S}){e<=1:[a-z]}',
+          '-s')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(?:\N{LATIN SMALL LETTER SHARP S}){e<=1:[a-z]}',
+          's-')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)(?:\N{LATIN SMALL LETTER SHARP S}){e<=1:[a-z]}',
+          's-')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)(?:\N{LATIN SMALL LETTER SHARP S}){e<=1:[a-z]}',
+          '-s')), False)
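+
+        # For context: a trailing character set in a fuzzy constraint, e.g.
+        # {e<=1:[a-z]}, restricts the characters that may take part in the
+        # permitted errors to that set.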
+
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(\N{LATIN SMALL LETTER SHARP S})\1{e<=1:[a-z]}',
+          'ssst')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(\N{LATIN SMALL LETTER SHARP S})\1{e<=1:[a-z]}',
+          'ssts')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)\1{e<=1:[a-z]}(\N{LATIN SMALL LETTER SHARP S})',
+          'stss')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)\1{e<=1:[a-z]}(\N{LATIN SMALL LETTER SHARP S})',
+          'tsss')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(\N{LATIN SMALL LETTER SHARP S})\1{e<=1:[a-z]}',
+          'ss-s')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(\N{LATIN SMALL LETTER SHARP S})\1{e<=1:[a-z]}',
+          'sss-')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)(\N{LATIN SMALL LETTER SHARP S})\1{e<=1:[a-z]}',
+          '-s')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)(\N{LATIN SMALL LETTER SHARP S})\1{e<=1:[a-z]}',
+          's-')), False)
+
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(ss)\1{e<=1:[a-z]}',
+          '\N{LATIN SMALL LETTER SHARP S}ts')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(ss)\1{e<=1:[a-z]}',
+          '\N{LATIN SMALL LETTER SHARP S}st')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)\1{e<=1:[a-z]}(ss)',
+          'st\N{LATIN SMALL LETTER SHARP S}')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)\1{e<=1:[a-z]}(ss)',
+          'ts\N{LATIN SMALL LETTER SHARP S}')), True)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(ss)\1{e<=1:[a-z]}',
+          '\N{LATIN SMALL LETTER SHARP S}-s')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?fiu)(ss)\1{e<=1:[a-z]}',
+          '\N{LATIN SMALL LETTER SHARP S}s-')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)(ss)\1{e<=1:[a-z]}',
+          's-\N{LATIN SMALL LETTER SHARP S}')), False)
+        self.assertEqual(bool(regex.fullmatch(r'(?firu)(ss)\1{e<=1:[a-z]}',
+          '-s\N{LATIN SMALL LETTER SHARP S}')), False)
+
+    def test_subscripted_captures(self):
+        self.assertEqual(regex.match(r'(?P<x>.)+',
+          'abc').expandf('{0} {0[0]} {0[-1]}'), 'abc abc abc')
+        self.assertEqual(regex.match(r'(?P<x>.)+',
+          'abc').expandf('{1} {1[0]} {1[1]} {1[2]} {1[-1]} {1[-2]} {1[-3]}'),
+          'c a b c c b a')
+        self.assertEqual(regex.match(r'(?P<x>.)+',
+          'abc').expandf('{x} {x[0]} {x[1]} {x[2]} {x[-1]} {x[-2]} {x[-3]}'),
+          'c a b c c b a')
+
+        self.assertEqual(regex.subf(r'(?P<x>.)+', r'{0} {0[0]} {0[-1]}',
+          'abc'), 'abc abc abc')
+        self.assertEqual(regex.subf(r'(?P<x>.)+',
+          '{1} {1[0]} {1[1]} {1[2]} {1[-1]} {1[-2]} {1[-3]}', 'abc'),
+          'c a b c c b a')
+        self.assertEqual(regex.subf(r'(?P<x>.)+',
+          '{x} {x[0]} {x[1]} {x[2]} {x[-1]} {x[-2]} {x[-3]}', 'abc'),
+          'c a b c c b a')
+
+    def test_more_zerowidth(self):
+        if sys.version_info >= (3, 7, 0):
+            self.assertEqual(regex.split(r'\b|:+', 'a::bc'), ['', 'a', '', '',
+              'bc', ''])
+            self.assertEqual(regex.sub(r'\b|:+', '-', 'a::bc'), '-a---bc-')
+            self.assertEqual(regex.findall(r'\b|:+', 'a::bc'), ['', '', '::',
+              '', ''])
+            self.assertEqual([m.span() for m in regex.finditer(r'\b|:+',
+              'a::bc')], [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
+            self.assertEqual([m.span() for m in regex.finditer(r'(?m)^\s*?$',
+              'foo\n\n\nbar')], [(4, 4), (4, 5), (5, 5)])
+
+    def test_line_ending(self):
+        self.assertEqual(regex.findall(r'\R', '\r\n\n\x0B\f\r\x85\u2028\u2029'),
+          ['\r\n', '\n', '\x0B', '\f', '\r', '\x85', '\u2028', '\u2029'])
+        self.assertEqual(regex.findall(br'\R', b'\r\n\n\x0B\f\r\x85'), [b'\r\n',
+          b'\n', b'\x0B', b'\f', b'\r'])
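+
+        # For context, as the assertions above show: \R matches any
+        # line-ending sequence, treating \r\n as a single unit and otherwise
+        # matching \n, \x0B, \f, \r, \x85, \u2028 or \u2029 individually.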
+
+def test_main():
+    unittest.main(verbosity=2)
+
+if __name__ == "__main__":
+    test_main()
diff --git a/buildVars.py b/buildVars.py
index c1da588..8b5fd86 100644
--- a/buildVars.py
+++ b/buildVars.py
@@ -19,7 +19,7 @@
 	# Translators: Long description to be shown for this add-on on add-on information from add-ons manager
 	"addon_description" : _("""This is the IBMTTS synthesizer driver for NVDA."""),
 	# version
-	"addon_version" : "23.12.1",
+	"addon_version" : "24.12",
 	# Author(s)
 	"addon_author" : u"David CM , x0 and others",
 	# URL for the add-on documentation support