# Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
# (after stripping out empty strings).
wordsep_re = re.compile(r'(\s+|' # any whitespace
- r'-*\w{2,}-(?=\w{2,})|' # hyphenated words
+ r'[^\s\w]*\w{2,}-(?=\w{2,})|' # hyphenated words, optionally preceded by non-word, non-space characters
r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash
- # XXX will there be a locale-or-charset-aware version of
- # string.lowercase in 2.3?
+ # XXX this is not locale- or charset-aware -- string.lowercase
+ # is US-ASCII only (and therefore English-only)
sentence_end_re = re.compile(r'[%s]' # lowercase letter
r'[\.\!\?]' # sentence-ending punct.
r'[\"\']?' # optional end-of-quote