Splitting at whitespaces with google-diff-match-patch

Matas Šeimys

Apr 18, 2013 • 1 min read

This simple Python example produces a nice, human-readable word-level diff by splitting the texts at whitespace, be it a space, a tab, or a newline:

from diff_match_patch import diff_match_patch

# Differ instance plus the two sample texts to compare.
dmp = diff_match_patch()
c1 = 'some long\nfile with newlines'
c2 = 'some long\nfiles\twith no newlines'

# Encode each whitespace-delimited token as one character.  c2 is passed as
# the first text and c1 as the second, so the diff describes turning c2 into c1.
tmp = dmp.diff_linesToChars(c2, c1, r'\s+')
encoded_c2, encoded_c1, tokens = tmp

# Diff the compact encodings (line-mode speedup disabled), then swap every
# encoded character back for its token.  The return value of
# diff_charsToLines is not used; mydiff itself is what gets rendered.
mydiff = dmp.diff_main(encoded_c2, encoded_c1, checklines=False)
dmp.diff_charsToLines(mydiff, tokens)
render_diff = dmp.diff_prettyHtml(mydiff)

The documented API of diff_linesToChars has no optional third parameter, but you can add one yourself by modifying the corresponding function in diff_match_patch.py so that it accepts a regular expression pattern rather than only a fixed string. By default, splitting still happens at newlines.

import re

def diff_linesToChars(self, text1, text2, splitter=None):
    """Split two texts into an array of strings.  Reduce the texts to a string
    of hashes where each Unicode character represents one token.

    By default a token is a line: the text up to and including a newline.
    When ``splitter`` is given, a token is the text up to and including the
    next match of that regular expression, so the separator always stays
    attached to the token it terminates and concatenating the tokens
    reproduces the input exactly.

    Args:
        text1: First string.
        text2: Second string.
        splitter: Optional regular expression (pattern string or compiled
            pattern) matching token separators, e.g. r'\\s+' for a word
            level diff.  Defaults to a single newline.

    Returns:
        Three element tuple, containing the encoded text1, the encoded text2 and
        the array of unique strings.  The zeroth element of the array of unique
        strings is intentionally blank.
    """
    pattern = re.compile(r'\n' if splitter is None else splitter)

    # Python 2 names the code-point-to-character builtin ``unichr``;
    # Python 3 only has ``chr``.  Support both.
    try:
        toChar = unichr
    except NameError:
        toChar = chr

    lineArray = []  # e.g. lineArray[4] == "Hello\n"
    lineHash = {}   # e.g. lineHash["Hello\n"] == 4

    # "\x00" is a valid character, but various debuggers don't like it.
    # So we'll insert a junk entry to avoid generating a null character.
    lineArray.append('')

    def diff_linesToCharsMunge(text):
        """Split a text into an array of strings.  Reduce the texts to a string
        of hashes where each Unicode character represents one token.
        Modifies lineArray and lineHash through being a closure.

        Args:
          text: String to encode.

        Returns:
          Encoded string.
        """
        chars = []
        # Walk the text, pulling out a substring for each token.
        # Splitting the whole text up front would temporarily double our
        # memory footprint.
        # Modifying text would create many large strings to garbage collect.
        lineStart = 0
        lineEnd = -1
        lastIndex = len(text) - 1
        while lineEnd < lastIndex:
            m = pattern.search(text, lineStart)
            if m:
                # End the token on the last character of the separator so a
                # multi-character match (e.g. two spaces for r'\s+', or
                # "\r\n") is not torn apart.  For a zero-width match fall
                # back to m.start() so at least one character is consumed
                # and the loop always advances.
                lineEnd = max(m.start(), m.end() - 1)
            else:
                # No further separator: the rest of the text is one token.
                lineEnd = lastIndex
            line = text[lineStart:lineEnd + 1]
            lineStart = lineEnd + 1

            # Register unseen tokens, then emit the token's index as a char.
            if line not in lineHash:
                lineArray.append(line)
                lineHash[line] = len(lineArray) - 1
            chars.append(toChar(lineHash[line]))
        return "".join(chars)

    chars1 = diff_linesToCharsMunge(text1)
    chars2 = diff_linesToCharsMunge(text2)
    return (chars1, chars2, lineArray)

Sign up for more like this.