Julia Regular Expressions
Regular expressions are powerful text pattern matching tools. Julia provides full regular expression support.
Creating Regular Expressions
Basic Syntax
julia
# Create regex using r"" syntax
pattern = r"hello"
println(typeof(pattern)) # Regex
# Match test
text = "hello world"
println(occursin(pattern, text)) # true
# With flags
pattern_i = r"hello"i # Case insensitive
println(occursin(pattern_i, "HELLO")) # true
Regular Expression Flags
julia
# i - Case insensitive
r"hello"i
# m - Multiline mode (^ and $ match each line)
r"^hello"m
# s - Single line mode (. matches newline)
r"a.b"s
# x - Extended mode (allows whitespace and comments)
r"""
\d+ # Match digits
\s* # Optional whitespace
[a-z]+ # Match letters
"""x
# Combine multiple flags
r"hello"ims
Basic Matching
occursin Check
julia
text = "Hello, Julia!"
# Simple check
println(occursin(r"Julia", text)) # true
println(occursin(r"Python", text)) # false
# Case insensitive
println(occursin(r"hello"i, text)) # true
match Function
julia
text = "My phone is 123-456-7890"
# Find first match
m = match(r"\d{3}-\d{3}-\d{4}", text)
println(m) # RegexMatch("123-456-7890")
println(m.match) # "123-456-7890"
println(m.offset) # 13 (match start position)
# No match returns nothing
m = match(r"email", text)
println(m === nothing) # true
eachmatch Iteration
julia
text = "apple: 5, banana: 3, cherry: 8"
# Find all numbers
for m in eachmatch(r"\d+", text)
println(m.match)
end
# Output: 5, 3, 8
# Collect as array
matches = collect(eachmatch(r"\d+", text))
numbers = [m.match for m in matches]
println(numbers) # ["5", "3", "8"]
Regular Expression Syntax
Character Classes
julia
text = "a1B2c3"
# Digits
println(collect(eachmatch(r"\d", text))) # 1, 2, 3
# Letters
println(collect(eachmatch(r"[a-zA-Z]", text))) # a, B, c
# Common character classes
# \d - Digit [0-9]
# \D - Non-digit
# \w - Word character [a-zA-Z0-9_]
# \W - Non-word character
# \s - Whitespace
# \S - Non-whitespace
# . - Any character (except newline)
Quantifiers
julia
# ? - 0 or 1
# * - 0 or more
# + - 1 or more
# {n} - Exactly n
# {n,} - At least n
# {n,m} - n to m
text = "aaa ab abbb"
println(match(r"ab*", text).match) # "a" (b occurs 0 times)
println(match(r"ab+", text).match) # "ab"
println(match(r"ab{2}", text).match) # "abb"
Anchors
julia
# ^ - Start of string
# $ - End of string
# \b - Word boundary
text = "Hello World"
println(occursin(r"^Hello", text)) # true
println(occursin(r"World$", text)) # true
println(occursin(r"\bWorld\b", text)) # true
# Multiline mode
multiline = "line1\nline2"
for m in eachmatch(r"^line\d"m, multiline)
println(m.match)
end
# Output: line1, line2
Grouping
julia
text = "John Smith, Jane Doe"
# Capture groups
pattern = r"(\w+) (\w+)"
m = match(pattern, text)
println(m.match) # "John Smith"
println(m.captures) # ["John", "Smith"]
println(m[1]) # "John"
println(m[2]) # "Smith"
# Named capture groups
pattern = r"(?<first>\w+) (?<last>\w+)"
m = match(pattern, text)
println(m[:first]) # "John"
println(m[:last]) # "Smith"
Non-Capturing Groups
julia
# (?:...) Groups but doesn't capture
text = "Mr. John Smith"
pattern = r"(?:Mr\.|Mrs\.|Ms\.) (\w+) (\w+)"
m = match(pattern, text)
println(m.captures) # ["John", "Smith"], doesn't include title
Alternation
julia
# | - Or
text = "I have a cat and a dog"
pattern = r"cat|dog"
for m in eachmatch(pattern, text)
println(m.match)
end
# Output: cat, dog
String Replacement
replace Function
julia
text = "Hello, World!"
# Simple replacement
result = replace(text, r"World" => "Julia")
println(result) # "Hello, Julia!"
# Case insensitive
result = replace("HELLO", r"hello"i => "hi")
println(result) # "hi"
# Using capture groups
text = "2023-12-25"
result = replace(text, r"(\d{4})-(\d{2})-(\d{2})" => s"\2/\3/\1")
println(result) # "12/25/2023"
# Using function
text = "hello world"
result = replace(text, r"\w+" => uppercase)
println(result) # "HELLO WORLD"
Complex Replacement
julia
# Using a function to process each match.
# The dollar signs are escaped: a bare `$100` inside a string literal is read as
# interpolation syntax and does not parse.
text = "Price: \$100 and \$200"

"""
    double_price(m)

Return the integer written in `m`, doubled, as a `String`.

`m` may be the matched text itself (what `replace` hands to a replacement
function) or a `RegexMatch` (what `match`/`eachmatch` return).
"""
function double_price(m::AbstractString)
    price = parse(Int, m)
    return string(price * 2)
end

# Convenience method so the helper also accepts a RegexMatch.
double_price(m::RegexMatch) = double_price(m.match)

# `replace` calls the function with the matched substring, not a RegexMatch.
result = replace(text, r"\d+" => double_price)
println(result) # "Price: $200 and $400"
Splitting Strings
julia
text = "apple, banana; cherry: date"
# Split by multiple delimiters
parts = split(text, r"[,;:]\s*")
println(parts) # ["apple", "banana", "cherry", "date"]
# Keep empty fields (keepempty=true is the default); the delimiters themselves are dropped
parts = split(text, r"[,;:]", keepempty=true)
println(parts)
# Split by whitespace
text = "hello world\tjulia"
parts = split(text, r"\s+")
println(parts) # ["hello", "world", "julia"]
Practical Patterns
Validation Patterns
julia
# Email validation
"""
    is_valid_email(email)

Return `true` if the whole of `email` looks like `local@domain.suffix`, where the
local and domain parts are made of word characters, dots, or hyphens and the
suffix is made of word characters.
"""
function is_valid_email(email)
    # Anchor the end with \z instead of $: $ also matches just before a trailing
    # newline, which would let "test@example.com\n" pass validation.
    pattern = r"^[\w\.-]+@[\w\.-]+\.\w+\z"
    return occursin(pattern, email)
end
println(is_valid_email("test@example.com")) # true
println(is_valid_email("invalid-email")) # false
# Phone number validation
"""
    is_valid_phone(phone)

Return `true` if the whole of `phone` has the form `ddd-ddd-dddd`
(three digits, three digits, four digits, separated by hyphens).
"""
function is_valid_phone(phone)
    # Anchor the end with \z instead of $: $ also matches just before a trailing
    # newline, which would let "123-456-7890\n" pass validation.
    pattern = r"^\d{3}-\d{3}-\d{4}\z"
    return occursin(pattern, phone)
end
println(is_valid_phone("123-456-7890")) # true
println(is_valid_phone("12345")) # false
# URL validation
"""
    is_valid_url(url)

Return `true` if the whole of `url` is an `http://` or `https://` address whose host
and optional `/`-separated path segments consist of word characters, dots, or
hyphens.
"""
function is_valid_url(url)
    # Anchor the end with \z instead of $: $ also matches just before a trailing
    # newline, which would let a URL followed by "\n" pass validation.
    pattern = r"^https?://[\w\.-]+(?:/[\w\.-]*)*\z"
    return occursin(pattern, url)
end
println(is_valid_url("https://example.com/path")) # true
Extraction Patterns
julia
# Extract all numbers
"""
    extract_numbers(text)

Return every run of digits found in `text`, parsed as `Int`, in order of appearance.
"""
function extract_numbers(text)
    return map(eachmatch(r"\d+", text)) do found
        parse(Int, found.match)
    end
end
println(extract_numbers("I have 3 apples and 5 oranges"))
# [3, 5]
# Extract all words
"""
    extract_words(text)

Return the words in `text` (maximal runs of word characters), in order of appearance.
"""
function extract_words(text)
    hits = eachmatch(r"\b\w+\b", text)
    return map(hit -> hit.match, hits)
end
println(extract_words("Hello, World!"))
# ["Hello", "World"]
# Extract key-value pairs
"""
    extract_pairs(text)

Collect every `key=value` occurrence in `text` (both sides made of word characters)
into a `Dict{String, String}`. When a key appears more than once, the last value wins.
"""
function extract_pairs(text)
    found = eachmatch(r"(\w+)=(\w+)", text)
    return Dict{String, String}(kv[1] => kv[2] for kv in found)
end
println(extract_pairs("name=Alice age=30"))
# Dict("name" => "Alice", "age" => "30")
Text Cleaning
julia
# Remove HTML tags
"""
    strip_html(html)

Return `html` with every `<...>` tag removed; the text between tags is kept.
"""
strip_html(html) = replace(html, r"<[^>]+>" => "")
println(strip_html("<p>Hello <b>World</b></p>"))
# "Hello World"
# Normalize whitespace
"""
    normalize_whitespace(text)

Collapse each run of whitespace in `text` to a single space, then trim leading and
trailing whitespace.
"""
function normalize_whitespace(text)
    collapsed = replace(text, r"\s+" => " ")
    return strip(collapsed)
end
println(normalize_whitespace(" hello world "))
# "hello world"
# Remove non-alphanumeric characters
"""
    remove_special(text)

Return `text` with every character that is neither a word character nor whitespace
removed.
"""
function remove_special(text)
    unwanted = r"[^\w\s]"
    return replace(text, unwanted => "")
end
println(remove_special("Hello, World! @2023"))
# "Hello World 2023"
Advanced Features
Backreferences
julia
# Match repeated words
text = "the the quick brown fox fox"
pattern = r"\b(\w+)\s+\1\b"
for m in eachmatch(pattern, text)
println("Repeated word: $(m.match)")
end
# Output: "the the", "fox fox"
Lookahead and Lookbehind
julia
# Positive lookahead (?=...)
# Match "foo" followed by "bar"
text = "foobar foobaz"
pattern = r"foo(?=bar)"
m = match(pattern, text)
println(m.match) # "foo"
# Negative lookahead (?!...)
# Match "foo" not followed by "bar"
pattern = r"foo(?!bar)"
m = match(pattern, text)
println(m.offset) # 8 (second foo)
# Positive lookbehind (?<=...)
text = "USD100 EUR200"
pattern = r"(?<=USD)\d+"
m = match(pattern, text)
println(m.match) # "100"
# Negative lookbehind (?<!...)
# (?<!USD) alone is not enough: the engine rejects the "1" right after "USD" but
# then matches "00", because those digits are preceded by "SD1", not "USD".
# Adding (?<!\d) forces the match to start at the beginning of a number.
pattern = r"(?<!USD)(?<!\d)\d+" # A whole number not preceded by USD
m = match(pattern, text)
println(m.match) # "200"
Non-Greedy Matching
julia
text = "<div>content</div>"
# Greedy matching
m = match(r"<.*>", text)
println(m.match) # "<div>content</div>"
# Non-greedy matching (add ?)
m = match(r"<.*?>", text)
println(m.match) # "<div>"
Performance Optimization
Compile Once, Use Multiple Times
julia
# Store the regex as a constant so it is built once and shared by every call.
# The end is anchored with \z instead of $: $ also matches just before a trailing
# newline, which would let "a@b.com\n" pass validation.
const EMAIL_PATTERN = r"^[\w\.-]+@[\w\.-]+\.\w+\z"

"""
    validate_email(email)

Return `true` if the whole of `email` matches `EMAIL_PATTERN`.
"""
function validate_email(email)
    return occursin(EMAIL_PATTERN, email)
end
# Batch validation
emails = ["a@b.com", "invalid", "x@y.org"]
for email in emails
println("$email: $(validate_email(email))")
end
Avoid Overly Complex Patterns
julia
# Complex patterns can cause performance issues
# Use simpler patterns, process in steps if necessary
# Bad practice: one complex regex
# r"^(?=.*[A-Z])(?=.*[a-z])(?=.*\d).{8,}$"
# Better practice: check in steps
function is_strong_password(pwd)
length(pwd) >= 8 || return false
occursin(r"[A-Z]", pwd) || return false
occursin(r"[a-z]", pwd) || return false
occursin(r"\d", pwd) || return false
return true
end
Common Regular Expressions
julia
# Integer
r"-?\d+"
# Float
r"-?\d+\.?\d*"
# Email
r"[\w\.-]+@[\w\.-]+\.\w+"
# URL
r"https?://[\w\.-]+(?:/[\w\.-]*)?"
# IP address
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
# Date (YYYY-MM-DD)
r"\d{4}-\d{2}-\d{2}"
# Time (HH:MM:SS)
r"\d{2}:\d{2}:\d{2}"
# Chinese characters
r"[\u4e00-\u9fff]+"
Next Steps
After learning regular expressions, continue with: