Regular expression examples

  • Addresses
    
    //Address: State code (US)
    '/\b(?:A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY])\b/'
    
    //Address: ZIP code (US)
    '\b[0-9]{5}(?:-[0-9]{4})?\b'
    
    Columns
    
    //Columns: Match a regex starting at a specific column on a line.
    '^.{%SKIPAMOUNT%}(%REGEX%)'
    
    //Columns: Range of characters on a line, captured into backreference 1
    //Iterate over all matches to extract a column of text from a file
    //E.g. to grab the characters in columns 8..10, set SKIPAMOUNT to 7, and CAPTUREAMOUNT to 3
    '^.{%SKIPAMOUNT%}(.{%CAPTUREAMOUNT%})'
    
    Credit cards
    
    //Credit card: All major cards
    '^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6011[0-9]{12}|3(?:0[0-5]|[68][0-9])[0-9]{11}|3[47][0-9]{13})$'
    
    //Credit card: American Express
    '^3[47][0-9]{13}$'
    
    //Credit card: Diners Club
    '^3(?:0[0-5]|[68][0-9])[0-9]{11}$'
    
    //Credit card: Discover
    '^6011[0-9]{12}$'
    
    //Credit card: MasterCard
    '^5[1-5][0-9]{14}$'
    
    //Credit card: Visa
    '^4[0-9]{12}(?:[0-9]{3})?$'
    
    //Credit card: remove non-digits
    '/[^0-9]+/'
    
    CSV
    
    //CSV: Change delimiter
    //Changes the delimiter from a comma into a tab.
    //The capturing group makes sure delimiters inside double-quoted entries are ignored.
    '("[^"\r\n]*")?,(?![^",\r\n]*"$)'
    
    //CSV: Complete row, all fields.
    //Match complete rows in a comma-delimited file that has 3 fields per row, 
    //capturing each field into a backreference.  
    //To match CSV rows with more or fewer fields, simply duplicate or delete the capturing groups.
    '^("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)$'
    
    //CSV: Complete row, certain fields.
    //Set %SKIPLEAD% to the number of fields you want to skip at the start, and %SKIPTRAIL% to 
    //the number of fields you want to ignore at the end of each row.  
    //This regex captures 3 fields into backreferences.  To capture more or fewer fields, 
    //simply duplicate or delete the capturing groups.
    '^(?:(?:"[^"\r\n]*"|[^,\r\n]*),){%SKIPLEAD%}("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)(?:(?:"[^"\r\n]*"|[^,\r\n]*),){%SKIPTRAIL%}$'
    
    //CSV: Partial row, certain fields
    //Match the first SKIPLEAD+3 fields of each row in a comma-delimited file that has SKIPLEAD+3 
    //or more fields per row.  The 3 fields after SKIPLEAD are each captured into a backreference.  
    //All other fields are ignored.  Rows that have fewer than SKIPLEAD+3 fields are skipped.  
    //To capture more or fewer fields, simply duplicate or delete the capturing groups.
    '^(?:(?:"[^"\r\n]*"|[^,\r\n]*),){%SKIPLEAD%}("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)'
    
    //CSV: Partial row, leading fields
    //Match the first 3 fields of each row in a comma-delimited file that has 3 or more fields per row.  
    //The first 3 fields are each captured into a backreference.  All other fields are ignored.  
    //Rows that have fewer than 3 fields are skipped.  To capture more or fewer fields, 
    //simply duplicate or delete the capturing groups.
    '^("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)'
    
    //CSV: Partial row, variable leading fields
    //Match the first 3 fields of each row in a comma-delimited file.  
    //The first 3 fields are each captured into a backreference.
    //All other fields are ignored.  If a row has fewer than 3 fields, some of the backreferences 
    //will remain empty.  To capture more or fewer fields, simply duplicate or delete the capturing groups.  
    //The question mark after each group makes that group optional.
    '^("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)?,("[^"\r\n]*"|[^,\r\n]*)?'
    
    Dates
    
    //Date d/m/yy and dd/mm/yyyy
    //1/1/00 through 31/12/99 and 01/01/1900 through 31/12/2099
    //Matches invalid dates such as February 31st
    '\b(0?[1-9]|[12][0-9]|3[01])[- /.](0?[1-9]|1[012])[- /.](19|20)?[0-9]{2}\b'
    
    //Date dd/mm/yyyy
    //01/01/1900 through 31/12/2099
    //Matches invalid dates such as February 31st
    '(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)[0-9]{2}'
    
    //Date m/d/y and mm/dd/yyyy
    //1/1/00 through 12/31/99 and 01/01/1900 through 12/31/2099
    //Matches invalid dates such as February 31st
    //Accepts dashes, spaces, forward slashes and dots as date separators
    '\b(0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])[- /.](19|20)?[0-9]{2}\b'
    
    //Date mm/dd/yyyy
    //01/01/1900 through 12/31/2099
    //Matches invalid dates such as February 31st
    '(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)[0-9]{2}'
    
    //Date yy-m-d or yyyy-mm-dd
    //00-1-1 through 99-12-31 and 1900-01-01 through 2099-12-31
    //Matches invalid dates such as February 31st
    '\b(19|20)?[0-9]{2}[- /.](0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])\b'
    
    //Date yyyy-mm-dd
    //1900-01-01 through 2099-12-31
    //Matches invalid dates such as February 31st
    '(19|20)[0-9]{2}[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
    
    Delimiters
    
    //Delimiters: Replace commas with tabs
    //Replaces commas with tabs, except for commas inside double-quoted strings
    '((?:"[^",]*+")|[^,]++)*+,'
    
    Email addresses
    
    //Email address
    //Use this version to seek out email addresses in random documents and texts.
    //Does not match email addresses using an IP address instead of a domain name.
    //Does not match email addresses on new-fangled top-level domains with more than 4 letters such as .museum.  
    //Including these increases the risk of false positives when applying the regex to random documents.
    '\b[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
    
    //Email address (anchored)
    //Use this anchored version to check if a valid email address was entered.
    //Does not match email addresses using an IP address instead of a domain name.
    //Does not match email addresses on new-fangled top-level domains with more than 4 letters such as .museum.
    //Requires the "case insensitive" option to be ON.
    '^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$'
    
    //Email address (anchored; no consecutive dots)
    //Use this anchored version to check if a valid email address was entered.
    //Improves on the original email address regex by excluding addresses with consecutive dots such as john@aol...com
    //Does not match email addresses using an IP address instead of a domain name.
    //Does not match email addresses on new-fangled top-level domains with more than 4 letters such as .museum.  
    //Including these increases the risk of false positives when applying the regex to random documents.
    '^[A-Z0-9._%-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}$'
    
    //Email address (no consecutive dots)
    //Use this version to seek out email addresses in random documents and texts.
    //Improves on the original email address regex by excluding addresses with consecutive dots such as john@aol...com
    //Does not match email addresses using an IP address instead of a domain name.
    //Does not match email addresses on new-fangled top-level domains with more than 4 letters such as .museum.  
    //Including these increases the risk of false positives when applying the regex to random documents.
    '\b[A-Z0-9._%-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}\b'
    
    //Email address (specific TLDs)
    //Does not match email addresses using an IP address instead of a domain name.
    //Matches all country code top level domains, and specific common top level domains.
    '^[A-Z0-9._%-]+@[A-Z0-9.-]+\.(?:[A-Z]{2}|com|org|net|biz|info|name|aero|jobs|museum)$'
    
    //Email address: Replace with HTML link
    '\b(?:mailto:)?([A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b'
    
    HTML
    
    //HTML comment
    '<!--.*?-->'
    
    //HTML file
    //Matches a complete HTML file.  Place round brackets around the .*? parts you want to extract from the file.
    //Performance will be terrible on HTML files that miss some of the tags 
    //(and thus won't be matched by this regular expression).  Use the atomic version instead when your search 
    //includes such files (the atomic version will also fail invalid files, but much faster).
    '<html>.*?<head>.*?<title>.*?</title>.*?</head>.*?<body[^>]*>.*?</body>.*?</html>'
    
    //HTML file (atomic)
    //Matches a complete HTML file.  Place round brackets around the .*? parts you want to extract from the file.
    //Atomic grouping maintains the regular expression's performance on invalid HTML files.
    '<html>(?>.*?<head>)(?>.*?<title>)(?>.*?</title>)(?>.*?</head>)(?>.*?<body[^>]*>)(?>.*?</body>).*?</html>'
    
    //HTML tag
    //Matches the opening and closing pair of whichever HTML tag comes next.
    //The name of the tag is stored into the first capturing group.
    //The text between the tags is stored into the second capturing group.
    '<([A-Z][A-Z0-9]*)[^>]*>(.*?)</\1>'
    
    //HTML tag
    //Matches the opening and closing pair of a specific HTML tag.
    //Anything between the tags is stored into the first capturing group.
    //Does NOT properly match tags nested inside themselves.
    '<%TAG%[^>]*>(.*?)</%TAG%>'
    
    //HTML tag
    //Matches any opening or closing HTML tag, without its contents.
    '</?[a-z][a-z0-9]*[^<>]*>'