Skip to content

Julia Regular Expressions

Regular expressions are a powerful tool for matching patterns in text. Julia supports Perl-compatible regular expressions, provided by the PCRE library.

Creating Regular Expressions

Basic Syntax

julia
# Create a regex with the r"..." string-macro literal
pattern = r"hello"
println(typeof(pattern))  # Regex

# occursin returns true when the pattern matches anywhere in the string
text = "hello world"
println(occursin(pattern, text))  # true

# Flags are written directly after the closing quote
pattern_i = r"hello"i  # i = case insensitive
println(occursin(pattern_i, "HELLO"))  # true

Regular Expression Flags

julia
# i - Case insensitive
r"hello"i

# m - Multiline mode (^ and $ match at the start and end of every line,
#     not only at the start and end of the whole string)
r"^hello"m

# s - Single line mode (. also matches a newline character)
r"a.b"s

# x - Extended mode (whitespace and #-comments inside the pattern are ignored,
#     so a pattern can be laid out over several lines)
r"""
  \d+    # Match digits
  \s*    # Optional whitespace
  [a-z]+ # Match letters
"""x

# Combine multiple flags by writing them one after another
r"hello"ims

Basic Matching

occursin Check

julia
text = "Hello, Julia!"

# occursin(pattern, text) is true when the pattern matches anywhere in text
println(occursin(r"Julia", text))   # true
println(occursin(r"Python", text))  # false

# The i flag ignores case, so "hello" matches "Hello"
println(occursin(r"hello"i, text))  # true

match Function

julia
text = "My phone is 123-456-7890"

# match returns a RegexMatch describing the first match in text
m = match(r"\d{3}-\d{3}-\d{4}", text)
println(m)          # RegexMatch("123-456-7890")
println(m.match)    # 123-456-7890 (the matched substring)
println(m.offset)   # 13 (1-based index where the match starts)

# match returns nothing when the pattern is not found, so check the result
# before accessing fields such as m.match
m = match(r"email", text)
println(m === nothing)  # true

eachmatch Iteration

julia
text = "apple: 5, banana: 3, cherry: 8"

# eachmatch iterates over every non-overlapping match, left to right
for m in eachmatch(r"\d+", text)
    println(m.match)
end
# Output (one per line): 5, 3, 8

# collect turns the iterator into a vector of RegexMatch objects
matches = collect(eachmatch(r"\d+", text))
numbers = [m.match for m in matches]  # the matched substrings, still as text
println(numbers)  # elements are "5", "3", "8" (substrings, not integers)

Regular Expression Syntax

Character Classes

julia
text = "a1B2c3"

# Digits
println(collect(eachmatch(r"\d", text)))  # matches: 1, 2, 3

# Letters
println(collect(eachmatch(r"[a-zA-Z]", text)))  # matches: a, B, c

# Common character classes
# NOTE: Julia regexes are Unicode-aware by default, so \d, \w and \s also
# match non-ASCII digits, letters and whitespace. Add the `a` flag
# (e.g. r"\w+"a) to restrict them to the ASCII ranges shown below.
# \d  - Digit ([0-9] in ASCII mode)
# \D  - Non-digit
# \w  - Word character ([a-zA-Z0-9_] in ASCII mode)
# \W  - Non-word character
# \s  - Whitespace
# \S  - Non-whitespace
# .   - Any character (except newline, unless the s flag is set)

Quantifiers

julia
# Quantifiers apply to the preceding element and are greedy by default
# ?   - 0 or 1
# *   - 0 or more
# +   - 1 or more
# {n} - Exactly n
# {n,} - At least n
# {n,m} - n to m (inclusive)

text = "aaa ab abbb"

# match returns the leftmost position at which the pattern succeeds
println(match(r"ab*", text).match)   # "a" (b occurs 0 times)
println(match(r"ab+", text).match)   # "ab"
println(match(r"ab{2}", text).match) # "abb"

Anchors

julia
# ^  - Start of string
# $  - End of string (also matches just before a final trailing newline;
#      use \z to match only at the very end)
# \b - Word boundary

text = "Hello World"

println(occursin(r"^Hello", text))   # true
println(occursin(r"World$", text))   # true
println(occursin(r"\bWorld\b", text)) # true

# Multiline mode: with the m flag, ^ also matches right after each newline
multiline = "line1\nline2"
for m in eachmatch(r"^line\d"m, multiline)
    println(m.match)
end
# Output (one per line): line1, line2

Grouping

julia
text = "John Smith, Jane Doe"

# Capture groups: each (...) records the text it matched
pattern = r"(\w+) (\w+)"
m = match(pattern, text)  # first match only, i.e. "John Smith"

println(m.match)      # John Smith (the whole match)
println(m.captures)   # ["John", "Smith"] (one entry per group)
println(m[1])         # John (group 1)
println(m[2])         # Smith (group 2)

# Named capture groups: (?<name>...) can be looked up by name
pattern = r"(?<first>\w+) (?<last>\w+)"
m = match(pattern, text)
println(m[:first])    # John
println(m[:last])     # Smith

Non-Capturing Groups

julia
# (?:...) groups alternatives without creating a capture
text = "Mr. John Smith"
pattern = r"(?:Mr\.|Mrs\.|Ms\.) (\w+) (\w+)"
m = match(pattern, text)
println(m.captures)  # ["John", "Smith"]; the title is matched but not captured

Alternation

julia
# | - Or: matches either of the alternatives
text = "I have a cat and a dog"

pattern = r"cat|dog"
for m in eachmatch(pattern, text)
    println(m.match)
end
# Output (one per line): cat, dog

String Replacement

replace Function

julia
text = "Hello, World!"

# Simple replacement: replace(string, pattern => replacement)
result = replace(text, r"World" => "Julia")
println(result)  # Hello, Julia!

# Case insensitive
result = replace("HELLO", r"hello"i => "hi")
println(result)  # hi

# Using capture groups: in an s"..." substitution string, \1, \2, ... insert
# the text captured by the corresponding group
text = "2023-12-25"
result = replace(text, r"(\d{4})-(\d{2})-(\d{2})" => s"\2/\3/\1")
println(result)  # 12/25/2023

# Using a function: it is called with each matched substring, and its return
# value is used as the replacement
text = "hello world"
result = replace(text, r"\w+" => uppercase)
println(result)  # HELLO WORLD

Complex Replacement

julia
# Using a function to process each match
# `$` starts string interpolation in an ordinary Julia string literal, so a
# literal dollar sign has to be written as `\$`.
text = "Price: \$100 and \$200"

"""
    double_price(m)

Return the decimal string for twice the integer written in `m`.

`replace` hands the matched text (a string) to its replacement function, not a
`RegexMatch`, so the string method does the work. The `RegexMatch` method
forwards to it for callers that hold a result of `match` or `eachmatch`.
"""
double_price(m::AbstractString) = string(parse(Int, m) * 2)
double_price(m::RegexMatch) = double_price(m.match)

result = replace(text, r"\d+" => double_price)
println(result)  # Price: $200 and $400

Splitting Strings

julia
text = "apple, banana; cherry: date"

# Split by multiple delimiters (the \s* also consumes spaces after each one)
parts = split(text, r"[,;:]\s*")
println(parts)  # ["apple", "banana", "cherry", "date"]

# keepempty=true (the default) keeps empty fields in the result; it does NOT
# keep the delimiters, which split always removes. Without \s* in the pattern
# the space after each delimiter stays at the front of the next piece.
parts = split(text, r"[,;:]", keepempty=true)
println(parts)  # ["apple", " banana", " cherry", " date"]

# Split by whitespace (\s+ treats any run of spaces/tabs as one separator)
text = "hello   world\tjulia"
parts = split(text, r"\s+")
println(parts)  # ["hello", "world", "julia"]

Practical Patterns

Validation Patterns

julia
# Email validation
"""
    is_valid_email(email)

Return `true` if the whole of `email` has the shape `local@domain.tld`, where
the local and domain parts are runs of word characters, dots, or hyphens.

This is a simple shape check, not a full RFC 5322 validator.
"""
function is_valid_email(email)
    # \z anchors at the very end of the string; `$` would also accept a
    # trailing newline such as "test@example.com\n".
    pattern = r"^[\w\.-]+@[\w\.-]+\.\w+\z"
    return occursin(pattern, email)
end

println(is_valid_email("test@example.com"))   # true
println(is_valid_email("invalid-email"))      # false

# Phone number validation
"""
    is_valid_phone(phone)

Return `true` if the whole of `phone` has the form `ddd-ddd-dddd`
(three digits, three digits, four digits, separated by hyphens).
"""
function is_valid_phone(phone)
    # \z anchors at the very end of the string; `$` would also accept a
    # trailing newline such as "123-456-7890\n".
    pattern = r"^\d{3}-\d{3}-\d{4}\z"
    return occursin(pattern, phone)
end

println(is_valid_phone("123-456-7890"))  # true
println(is_valid_phone("12345"))         # false

# URL validation
"""
    is_valid_url(url)

Return `true` if the whole of `url` is an `http://` or `https://` address made
of a host followed by zero or more `/segment` parts, where host and segments
consist of word characters, dots, or hyphens.

Query strings, fragments, and ports are not accepted by this simple check.
"""
function is_valid_url(url)
    # \z anchors at the very end of the string; `$` would also accept a
    # trailing newline after the URL.
    pattern = r"^https?://[\w\.-]+(?:/[\w\.-]*)*\z"
    return occursin(pattern, url)
end

println(is_valid_url("https://example.com/path"))  # true

Extraction Patterns

julia
# Extract all numbers
"""
    extract_numbers(text)

Return the integers written as runs of digits in `text`, in order of
appearance. A leading minus sign is not part of a run.
"""
function extract_numbers(text)
    digit_runs = eachmatch(r"\d+", text)
    return map(digit_runs) do hit
        parse(Int, hit.match)
    end
end

println(extract_numbers("I have 3 apples and 5 oranges"))
# [3, 5]

# Extract all words
"""
    extract_words(text)

Return every run of word characters in `text`, in order of appearance.
"""
function extract_words(text)
    word_hits = eachmatch(r"\b\w+\b", text)
    return map(hit -> hit.match, word_hits)
end

println(extract_words("Hello, World!"))
# ["Hello", "World"]

# Extract key-value pairs
"""
    extract_pairs(text)

Return a `Dict{String, String}` holding every `key=value` occurrence in `text`,
where key and value are runs of word characters. If a key occurs more than
once, the last value is kept.
"""
function extract_pairs(text)
    assignments = eachmatch(r"(\w+)=(\w+)", text)
    return Dict{String, String}(hit[1] => hit[2] for hit in assignments)
end

println(extract_pairs("name=Alice age=30"))
# Dict("name" => "Alice", "age" => "30")

Text Cleaning

julia
# Remove HTML tags
"""
    strip_html(html)

Return `html` with every `<...>` tag removed; the text between tags is kept.
"""
strip_html(html) = replace(html, r"<[^>]+>" => "")

println(strip_html("<p>Hello <b>World</b></p>"))
# "Hello World"

# Normalize whitespace
"""
    normalize_whitespace(text)

Collapse every run of whitespace in `text` into a single space, then trim
leading and trailing whitespace from the result.
"""
normalize_whitespace(text) = replace(text, r"\s+" => " ") |> strip

println(normalize_whitespace("  hello   world  "))
# "hello world"

# Remove non-alphanumeric characters
"""
    remove_special(text)

Return `text` with every character that is neither a word character nor
whitespace removed.
"""
remove_special(text) = replace(text, r"[^\w\s]" => "")

println(remove_special("Hello, World! @2023"))
# "Hello World 2023"

Advanced Features

Backreferences

julia
# Match repeated words
text = "the the quick brown fox fox"
# \1 must match the same text that group 1 captured; the \b anchors keep the
# pattern from matching inside longer words
pattern = r"\b(\w+)\s+\1\b"

for m in eachmatch(pattern, text)
    println("Repeated word: $(m.match)")
end
# Output:
# Repeated word: the the
# Repeated word: fox fox

Lookahead and Lookbehind

julia
# Lookarounds test the text next to the current position without making it
# part of the match.

# Positive lookahead (?=...)
# Match "foo" only when it is followed by "bar"
text = "foobar foobaz"
pattern = r"foo(?=bar)"
m = match(pattern, text)
println(m.match)  # foo ("bar" is checked but not included)

# Negative lookahead (?!...)
# Match "foo" not followed by "bar"
pattern = r"foo(?!bar)"
m = match(pattern, text)
println(m.offset)  # 8 (second foo)

# Positive lookbehind (?<=...)
text = "USD100 EUR200"
pattern = r"(?<=USD)\d+"
m = match(pattern, text)
println(m.match)  # 100

# Negative lookbehind (?<!...)
# (?<!USD) on its own is not enough: the engine rejects the "1" in "USD100"
# but then matches "00", because those digits are preceded by "SD1", not "USD".
# The extra (?<!\d) forces the match to begin at the start of a digit run.
pattern = r"(?<!USD)(?<!\d)\d+"  # Digit run not preceded by USD
m = match(pattern, text)
println(m.match)  # 200

Non-Greedy Matching

julia
text = "<div>content</div>"

# Greedy matching: .* takes as much as possible, so the match runs to the last >
m = match(r"<.*>", text)
println(m.match)  # <div>content</div>

# Non-greedy matching (add ?): .*? takes as little as possible, so the match
# stops at the first >
m = match(r"<.*?>", text)
println(m.match)  # <div>

Performance Optimization

Compile Once, Use Multiple Times

julia
# Store the regex in a constant so every call shares one pattern definition.
# \z anchors at the very end of the string; `$` would also accept a trailing
# newline such as "a@b.com\n".
const EMAIL_PATTERN = r"^[\w\.-]+@[\w\.-]+\.\w+\z"

"""
    validate_email(email)

Return `true` if the whole of `email` matches `EMAIL_PATTERN`.
"""
function validate_email(email)
    return occursin(EMAIL_PATTERN, email)
end

# Batch validation
emails = ["a@b.com", "invalid", "x@y.org"]
for email in emails
    println("$email: $(validate_email(email))")
end

Avoid Overly Complex Patterns

julia
# Complex patterns can cause performance issues
# Use simpler patterns, process in steps if necessary

# Bad practice: one complex regex
# r"^(?=.*[A-Z])(?=.*[a-z])(?=.*\d).{8,}$"

# Better practice: check in steps
"""
    is_strong_password(pwd)

Return `true` if `pwd` has at least 8 characters and contains at least one
character in `A-Z`, one in `a-z`, and one digit; otherwise return `false`.
"""
function is_strong_password(pwd)
    long_enough = length(pwd) >= 8
    # && short-circuits, so later checks are skipped once one rule fails
    return long_enough &&
           occursin(r"[A-Z]", pwd) &&
           occursin(r"[a-z]", pwd) &&
           occursin(r"\d", pwd)
end

Common Regular Expressions

julia
# These patterns are unanchored: they find the shape anywhere in a string.
# Wrap one in ^...\z to require that the whole string matches.

# Integer (optional leading minus sign)
r"-?\d+"

# Float (the decimal point and fractional digits are optional)
r"-?\d+\.?\d*"

# Email
r"[\w\.-]+@[\w\.-]+\.\w+"

# URL (host followed by any number of /segment parts; `*` rather than `?`,
# so a multi-segment path such as /a/b is matched in full)
r"https?://[\w\.-]+(?:/[\w\.-]*)*"

# IP address (shape only: each part is 1-3 digits, so 999.999.999.999 matches)
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"

# Date (YYYY-MM-DD; digit counts only, month/day ranges are not checked)
r"\d{4}-\d{2}-\d{2}"

# Time (HH:MM:SS; digit counts only, value ranges are not checked)
r"\d{2}:\d{2}:\d{2}"

# Chinese characters
r"[\u4e00-\u9fff]+"

Next Steps

After learning regular expressions, continue with:

Content is for learning and research only.