Programming Resources
For Fun and Learning
Charles Cusack
Computer Science
Hope College
main

Python
C++

JAVA


PHP
SQL
Assignments

RegularExpressions


PassageParser.java

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This is code that was stripped out of my solution, with portions of my code removed. This is an example of how you
 * might parse a passage that is input from the user. This is not integrated into the BibleReader code so it doesn't
 * validate references (so Herman 2:3 would be parsed correctly, but this code does not check whether or not Herman 2:3
 * is actually a valid reference). It also leaves some cases out--I can't give you everything. You may use and modify
 * this code as you desire, but make sure you attribute it to the original author.
 * 
 * @author Chuck Cusack, February 2013.
 */

public class PassageParser {
// The regular expressions used to parse input.
// Learn more about Pattern and regular expressions here:
// http://java.sun.com/javase/6/docs/api/java/util/regex/Pattern.html
// The first thing to know is that the \ character is an escape character, and when seen in a regular
// expression, it means "the next character should be interpreted differently than normal".
// Thus, since \ normally means escape character, to produce a \, we need to escape it, so we use \\.
// Thus, every time you see \\ in a regular expression string, imagine it is really just a \.
//
// . means match any single character
// | means match either or, so "blah|foo" looks for either blah or foo.
// \s means a whitespace character (space, tab, newline, carriage return, etc.)
// \w means word character
// \d means any digit (0-9).
// * means 0 or more occurrences of the previous token. In this case the matching is "greedy",
// which essentially means "match as much of the string as you can with this pattern".
// This is a little technical, so see the above link for more details about this.
// Thus, \s* means 0 or more whitespace characters
// + means 1 or more occurrences of the previous token.
// Thus, \d+ means 1 or more number. This will capture an integer.
// () around something means treat it as a capture group which can be asked about later.
// For instance, given the regular expression
// "blah(\\d+)foo(\\s*)"
// If we try to match it given the string
// "stuff blah123foo other stuff"
// it will match the pattern starting at "blah", and group 1 is "123", and group 2 is " ".
// Group 0 is the whole matching portion, so in this case "blah123foo ".
// Below, I give the interpretation of some of the regular expressions I use.
// By the way, these are the sorts of comments you should consider removing when you
// copy code.  They are meant to help you understand the code and not necessarily to document it.

// Match zero or more whitespace characters followed by one or more numbers followed by
// zero or more whitespace characters.
// So, for instance "  234      " and "12 " will match, but "   123 44 " will not.

public static final String    number        = "\\s*(\\d+)\\s*";

//---------------------------------------------------------------------------------------------------------
// If you need to use parentheses to group something, but don't want it to count as a
// capturing group, you put "?: at the start.  For instance, "(?:1|2)" means match "1" or "2",
// but do not count this as a group. I use this below.
//
// The next one is trying to match things that start with a book name and then (possibly) other stuff.
// More specifically, match either
// (a) zero or more whitespaces characters followed by either a 1, 2, 3, I, II, or III,
// then 0 or more whitespaces, then 1 or more word characters, OR
// (b) 1 or more words consisting of only alphabetic characters.
// Then following either (a) or (b) is 0 or more whitespace characters followed by zero or more of any character.
//
// Thus, given a reference to a passage, this will capture the name of the book as group 1 (notice
// the location of parentheses) and everything else as group 2.
// For instance:
// "1 John 3:4" will match "1 John" as group 1 and "3:4" as group 2.
// "   3   Peter   3:1-5" will match "3   Peter" as group 1 and "3:1-5" as group 2.
// "1John 2:3" will not match because there is no space between the "1" and the "John".

public static Pattern bookPattern = Pattern.compile("\\s*((?:1|2|3|I|II|III)\\s*\\w+|(?:\\s*[a-zA-Z]+)+)\\s*(.*)");

//---------------------------------------------------------------------------------------------------------
// Two examples of patterns that are valid. More are needed.
// This one matches things like "3:4-7:3"
public static Pattern  cvcvPattern= Pattern.compile(number + ":" + number + "-" + number + ":" + number);
// This one matches things like "3-5"
public static Pattern  ccPattern  = Pattern.compile(number + "-" + number);

public void getPassage(String passageRef) {
    String theRest = null;
    String book = null;
    int chapter1, chapter2, verse1, verse2;

    // First, split the input into the book and the rest, if possible.
    Matcher m = bookPattern.matcher(passageRef);
    
    // Now see if it matches.
    if (m.matches()) {
        // It matches.  Good.
        book = m.group(1);
        theRest = m.group(2);
        // Now we need to parse theRest to see what format it is.
        // Notice that I have omitted some of the cases.
        // You should think about whether or not the order the
        // possibilities occurs matters if you use this and add more cases.
        if (theRest.length() == 0) {
            // It looks like they want a whole book.
            // So now you need to do something about it.
        } else if ((m = cvcvPattern.matcher(theRest)).matches()) {
            chapter1 = Integer.parseInt(m.group(1));
            verse1 = Integer.parseInt(m.group(2));
            chapter2 = Integer.parseInt(m.group(3));
            verse2 = Integer.parseInt(m.group(4));
            // They want something of the form book chapter1:verse1-chapter2:verse2
            // So now you need to do something about it.
        } else if ((m = ccPattern.matcher(theRest)).matches()) {
            chapter1 = Integer.parseInt(m.group(1));
            chapter2 = Integer.parseInt(m.group(2));
            // They want something of the form book chapter1-chapter2
            // So now you need to do something about it.
        } else {
            // They want something else that I haven't taken into account yet.
        }
    } else {
        // It doesn't match the overall format of "BOOK Stuff".
    }
}
}