Overview
Regular expressions allow you to find patterns in strings and work with them. They are an invaluable tool and allow a simpler and faster implementation for many non-trivial tasks on strings.
Java has supported regular expressions since version 1.4.
This tutorial uses the assert statement to explain the topic. If you don't fully understand assert yet, read Java Basics: Assert first.
Regular Expressions
Regular expressions are strings which may contain certain control sequences. However, they don't have to. The regular expressions in the first examples don't have any control sequences, and thus they match only if they are equal to the input string.
// A pattern without control sequences matches only text identical to itself.
String input = "foo bar bar";
assert input.matches("foo bar bar"); // pattern equals the whole input
assert !input.matches("dog"); // different text, so no match
// replaceFirst swaps only the first occurrence of the pattern
assert "bar bar bar".equals(input.replaceFirst("foo", "bar"));
assert "foo dog bar".equals(input.replaceFirst("bar", "dog"));
// replaceAll swaps every occurrence of the pattern
assert "bar bar bar".equals(input.replaceAll("foo", "bar"));
assert "foo dog dog".equals(input.replaceAll("bar", "dog"));
// Alternation ("|"), capturing groups ("()") and escaping of metacharacters.
String letters = "abc";
String tail = "xyz";
assert letters.matches("abc|xyz"); // "|" separates alternatives
assert tail.matches("abc|xyz"); // either alternative may match
assert !"123".matches("abc|xyz"); // neither alternative equals "123"
assert "aOc".equals(letters.replaceFirst("b|z", "O"));
assert "xyO".equals(tail.replaceFirst("b|z", "O"));
assert "OOO".equals(letters.replaceAll("a|b|c", "O"));
assert letters.matches("(a)(b)(c)"); // parentheses define capturing groups
assert letters.matches("((a)bc)"); // groups can be nested
String word = "cold";
assert word.matches("(b|c)old"); // alternation inside a group
assert word.matches("ice|((b|c)ol(d|t))|hot");
assert "cXXd".equals(word.replaceAll("o(d|l)", "XX"));
String metaChars = "(|)";
assert metaChars.matches("\\(\\|\\)"); // a backslash turns a metacharacter into a literal
// Character classes, negated classes, the "." wildcard and predefined classes.
String a = "abc";
assert a.matches("a[bgh]c"); // "[]" defines a character class (more)
assert a.replaceAll("[abc]", "X").equals("XXX");
assert a.matches("[a-z][a-z][a-z]"); // "-" defines a character range
assert a.replaceAll("[b-f]", "X").equals("aXX");
String b = "I love Java";
assert b.replaceAll("[a-zA-Z]", "X").equals("X XXXX XXXX");
assert b.replaceAll("[oa-cJ]", "X").equals("I lXve XXvX");
// A leading "^" negates the class. Every "[" needs its own "]"; an extra "[" opens
// a nested class that is never closed and the pattern fails to compile.
assert a.matches("[^x-z][^k][^st-w]"); // "^" means "all other characters match" (more)
assert a.replaceAll("[^b]", "X").equals("XbX");
assert "x".matches("."); // "." matches any character except newline (more)
assert a.matches("...");
assert b.matches("I.l.v. .a.a");
assert b.replaceAll(".", "X").equals("XXXXXXXXXXX");
assert a.matches("\\w\\w\\w"); // "\\w" is short for "[a-zA-Z_0-9]" (more)
assert b.replaceAll("\\w", "X").equals("X XXXX XXXX");
assert " ".matches("\\s"); // "\\s" matches whitespace ("[ \t\n\x0B\f\r]") (more)
assert b.replaceAll("\\s", "X").equals("IXloveXJava");
assert "123".matches("\\d\\d\\d"); // "\\d" is short for "[0-9]" (more)
assert !a.matches("\\W\\W\\W"); // "\\W" inverts "\\w", same as "[^a-zA-Z_0-9]" (more)
assert b.replaceAll("\\S", "X").equals("X XXXX XXXX"); // "\\S" inverts "\\s"
assert "a1".replaceAll("\\D", "X").equals("X1"); // "\\D" inverts "\\d"
// Quantifiers: "*", "+", "?" and "{n,m}". A quantifier applies to the element
// directly in front of it (a character, a class or a group).
String a = "b ba baa baaa";
assert a.replaceAll("ba*", "X").equals("X X X X"); // "*" means "zero or more" (more)
assert a.matches("ba* ba* ba* ba*");
// "aa" is consumed only in whole pairs, so a single left-over "a" stays in the result
assert a.replaceAll("b(aa)*", "X").equals("X Xa X Xa"); // apply "*" to group "aa"
assert a.replaceAll("b[a-z]*", "X").equals("X X X X");
assert a.matches("(ba* *)*"); // complex pattern (more)
assert a.replaceAll("ba+", "X").equals("b X X X"); // "+" means "one or more" (more)
assert a.matches("b ba+ ba+ ba+");
assert a.replaceAll("b(aa)+", "X").equals("b ba X Xa"); // apply "+" to group "aa"
assert a.replaceAll("b[a-z]+", "X").equals("b X X X");
assert a.matches("b( ba+)+"); // complex pattern (more)
assert a.replaceAll("ba?", "X").equals("X X Xa Xaa"); // "?" means optional (0 or 1) (more)
assert a.replaceAll("b(aa)?", "X").equals("X Xa X Xa"); // apply "?" to group "aa"
assert a.matches("(ba* ?)+"); // complex pattern
// "baaa" contains "baa", so it is replaced too and only its last "a" remains
assert a.replaceAll("ba{2}", "X").equals("b ba X Xa"); // "{2}" means "twice" (more)
assert a.replaceAll("ba{2,3}", "X").equals("b ba X X"); // "{2,3}" means "2-3 times" (more)
assert a.replaceAll("ba{2,}", "X").equals("b ba X X"); // "{2,}" means "at least twice" (more)
// Anchors ("^", "$") and word boundaries ("\b", "\B") match positions, not characters.
String a = "aaa";
String b = "abc aca";
assert a.replaceAll("^a", "X").equals("Xaa"); // "^" means "beginning of input/line" (more)
assert a.matches("^aaa");
assert !b.matches("^aca"); // matches() always tests the whole input
assert a.replaceAll("a$", "X").equals("aaX"); // "$" means "end of input/line" (more)
assert a.matches("^aaa$");
assert !b.matches("abc$");
assert b.replaceAll("\\ba", "X").equals("Xbc Xca"); // "\b" means "at word boundary" (more)
assert b.replaceAll("\\Ba", "X").equals("abc acX"); // "\B" means "not at word boundary" (more)
// Embedded flags switch matching modes from inside the pattern itself.
String mixedCase = "aBcDeF";
String lines = "A\nB\nC";
assert "XXXXXX".equals(mixedCase.replaceAll("(?i)[a-f]", "X")); // (?i): CASE_INSENSITIVE
assert lines.matches("(?s).+"); // (?s): DOTALL, "." also matches line breaks
assert lines.matches("(?is)[a-z].[a-z].[a-z]"); // (?is): both flags at once
assert !lines.matches("(?is)[a-z].[a-z].(?-i)[a-z]"); // (?-i): case-sensitive again from here on
assert "X\nX\nX".equals(lines.replaceAll("(?m)^\\w", "X")); // (?m): MULTILINE, "^" matches at every line start
Using Regular Expressions
// Matching a whole string, testing for a substring, and quoting arbitrary text.
String a = "I love Java";
assert a.matches("I love Java"); // match whole string
assert a.matches(".*love.*"); // contains 'love'? (more)
// Escaping a search string (more)
// Pattern.quote turns arbitrary text into a literal pattern. Matcher.quoteReplacement
// is meant for replacement strings: it escapes only "\" and "$", so characters such
// as "." or "+" in the search string would still act as metacharacters.
String anyString = ...; // works with any string
boolean containsString = a.matches(".*" + Pattern.quote(anyString) + ".*");
// Find all numbers in a string:
String a = "6 times 7 is 42";
Pattern pattern = Pattern.compile("[0-9]+"); // compile pattern (more)
Matcher m = pattern.matcher(a); // create Matcher (more)
while (m.find()) { // loop through all matches (more)
    // group() is the matched text; start() is inclusive, end() is exclusive
    System.out.println(m.group() + // print the match/number
            " at " + m.start() + " - " + m.end()); // print position (more)
}
// Parse an email address:
String b = "tom@example.org";
Pattern emailPtrn = Pattern.compile("([^@]+)@([^@]+)"); // Pattern with capturing groups (more)
Matcher em = emailPtrn.matcher(b);
// Matcher.matches() takes no argument; the input was already bound by matcher(b).
// The groups may only be read after a successful match, so the println must be
// guarded by the if (no ";" directly after the condition).
if (em.matches()) { // match whole string (more)
    System.out.println("Name=" + em.group(1) + " Host=" + em.group(2));
}
// Replacing through String and through a pre-compiled Pattern gives the same results.
String droid = "r2d2";
assert "002002".equals(droid.replaceAll("[a-z]", "00"));
assert "002d2".equals(droid.replaceFirst("[a-z]", "00"));
Pattern lowerCase = Pattern.compile("[a-z]"); // compiled once, usable for several operations
Matcher letters = lowerCase.matcher(droid);
assert "002002".equals(letters.replaceAll("00"));
assert "002d2".equals(letters.replaceFirst("00"));
// Replace all dates in format yyyy-mm-dd with ddmmyyyy in the string:
String a = "2009-12-24, 2009-12-31, 2010-01-01";
Pattern p = Pattern.compile("(\\d{4})-(\\d{2})-(\\d{2})"); // groups: 1 = year, 2 = month, 3 = day
Matcher m = p.matcher(a);
StringBuffer result = new StringBuffer();
while (m.find()) { // loop through all results
    // copies the text before the match, then appends day + month + year
    m.appendReplacement(result, m.group(3) + m.group(2) + m.group(1)); // add replacement (more)
}
m.appendTail(result); // important! copies the text after the last match (more)
System.out.println(result.toString()); // prints "24122009, 31122009, 01012010"
// Splitting strings around matches of a regular expression.
String a = "42, 17, 24, 5, 175";
String[] r = a.split("[ ,]+"); // comma and space split (more)
assert r.length == 5; // split into 5 substrings
assert r[0].equals("42") && r[1].equals("17"); // ...
String b = "-3--2";
String[] r2 = b.split("-");
assert r2.length == 4;
assert r2[0].equals("") && r2[2].equals(""); // substrings may be empty
String[] r3 = a.split("[ ,]+", 3); // limit to 3 (more)
assert r3.length == 3; // limited array size
assert r3[2].equals("24, 5, 175"); // contains remaining text
String c = "1, 2,, 3,,,";
// Split on single commas: "[ ,]+" would swallow the consecutive separators and
// never produce an empty element. Result here: "1", " 2", "", " 3".
String[] r4 = c.split(",", 0); // remove empty strings at the end (more)
assert r4.length == 4; // empty strings at the end removed
assert r4[2].equals(""); // other empty strings remain
// Pattern has exact equivalents of String's split methods:
Pattern p = Pattern.compile("[ ,]+"); // same separator pattern, compiled once
assert p.split(a).length == 5;
assert p.split(a, 3).length == 3; // with limit
// Case-insensitive matching:
String shout = "I lOvE jAvA";
int flags = Pattern.CASE_INSENSITIVE; // ASCII only!
Pattern lovePattern = Pattern.compile(".*love.*", flags);
Matcher whole = lovePattern.matcher(shout);
assert whole.matches();
// Find all lines containing numbers:
String b = "23\naSd\n5\nLOVE\nNYE\n3\n"; // six lines separated by "\n"
Pattern p2 = Pattern.compile("^\\d+$", Pattern.MULTILINE); // multiline (more)
Matcher m2 = p2.matcher(b);
while (m2.find()) {
    System.out.println(m2.group()); // print numbers 23, 5 and 3
}
// Patterns that may span several lines:
String c = "a\nb\nc"; // three lines: "a", "b", "c"
Pattern p3 = Pattern.compile("a.*b.*c", Pattern.DOTALL); // "." matches newlines (more)
assert p3.matcher(c).matches();
assert !c.matches("a.*b.*c"); // no match without DOTALL
// Multiple flags:
// Flags are int constants of Pattern and are combined with bitwise OR; each one
// must be qualified with the class name.
Pattern p4 = Pattern.compile(".*love.*",
        Pattern.CASE_INSENSITIVE | Pattern.DOTALL); // combining flags (more)
assert p4.matcher(b).matches(); // b is the multi-line string declared above
Advanced Regular Expressions
Regular expression quantifiers are greedy by default. This means that each quantifier tries to get as many repetitions as possible. As the regular expression is executed from left to right, this means that the first quantifiers may get more repetitions than the following ones. In some situations, this is not desirable, and that's what non-greedy quantifiers are for. They try to match as few repetitions as possible while still fulfilling the pattern as a whole.
In order to get a non-greedy quantifier, just append a "?", e.g. write "*?" instead of just "*".
// The same input matched with greedy and with non-greedy quantifiers.
String csv = "a,b,c";
Pattern greedy = Pattern.compile("(.*),(.*)");
Matcher greedyMatch = greedy.matcher(csv);
if (greedyMatch.matches()) {
    // the first group takes as much as it can
    System.out.println(greedyMatch.group(1) + " " + greedyMatch.group(2)); // prints "a,b c"
}
Pattern reluctant = Pattern.compile("(.*?),(.*?)");
Matcher reluctantMatch = reluctant.matcher(csv);
if (reluctantMatch.matches()) {
    // the first group takes as little as it can
    System.out.println(reluctantMatch.group(1) + " " + reluctantMatch.group(2)); // prints "a b,c"
}
Lookahead assertions allow you to state that a pattern only matches if it is followed by another sub-pattern. This sub-pattern itself is not part of the actual match though.
// Lookahead: "(?=...)" requires, "(?!...)" forbids what follows; neither consumes input.
String a = "ab ac ad";
String b = "pineapple";
assert a.replaceAll("a(?=c)", "X").equals("ab Xc ad"); // Match "a" followed by "c" (more)
assert a.replaceAll("a(?!c)", "X").equals("Xb ac Xd"); // Match "a" not followed by "c" (more)
assert b.replaceAll("[a-z](?=[eip])", "X").equals("XiXeXXpXe");
assert b.replaceAll("[a-z](?![eip])", "X").equals("pXnXapXlX");
assert "5".replaceFirst("^(?=\\d$)", "0").equals("05"); // prepend 0 to single-digit strings
// Lookahead can be everywhere in the pattern (more):
// here the character after the lookahead must satisfy both "[eip]" and "[a-e]", i.e. be "e"
assert b.replaceAll("[a-z](?=[eip])[a-e]", "X").equals("piXappX");
Similar to lookahead assertions, lookbehind assertions allow you to state a sub-pattern that must be in front of the main pattern. The sub-pattern itself is not part of the match.
// Lookbehind: "(?<=...)" requires, "(?<!...)" forbids what precedes the match.
String syllables = "ba ca da";
assert "ba cX da".equals(syllables.replaceAll("(?<=c)a", "X")); // "a" directly after "c"
assert "bX ca dX".equals(syllables.replaceAll("(?<!c)a", "X")); // "a" not directly after "c"
You can find more information on lookahead and lookbehind in the Regex Tutorial.