Package logilab :: Package common :: Module textutils
[frames] | no frames]

Source Code for Module logilab.common.textutils

  1  # copyright 2003-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-common. 
  5  # 
  6  # logilab-common is free software: you can redistribute it and/or modify it under 
  7  # the terms of the GNU Lesser General Public License as published by the Free 
  8  # Software Foundation, either version 2.1 of the License, or (at your option) any 
  9  # later version. 
 10  # 
 11  # logilab-common is distributed in the hope that it will be useful, but WITHOUT 
 12  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
 13  # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more 
 14  # details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-common.  If not, see <http://www.gnu.org/licenses/>. 
 18  """Some text manipulation utility functions. 
 19   
 20   
 21  :group text formatting: normalize_text, normalize_paragraph, pretty_match,\ 
 22  unquote, colorize_ansi 
 23  :group text manipulation: searchall, splitstrip 
 24  :sort: text formatting, text manipulation 
 25   
 26  :type ANSI_STYLES: dict(str) 
 27  :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code 
 28   
 29  :type ANSI_COLORS: dict(str) 
 30  :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code 
 31   
 32  :type ANSI_PREFIX: str 
 33  :var ANSI_PREFIX: 
 34    ANSI terminal code notifying the start of an ANSI escape sequence 
 35   
 36  :type ANSI_END: str 
 37  :var ANSI_END: 
 38    ANSI terminal code notifying the end of an ANSI escape sequence 
 39   
 40  :type ANSI_RESET: str 
 41  :var ANSI_RESET: 
 42    ANSI terminal code resetting format defined by a previous ANSI escape sequence 
 43  """ 
 44  __docformat__ = "restructuredtext en" 
 45   
 46  import sys 
 47  import re 
 48  import os.path as osp 
 49  from warnings import warn 
 50  from unicodedata import normalize as _uninormalize 
 51  try: 
 52      from os import linesep 
 53  except ImportError: 
 54      linesep = '\n' # gae 
 55   
 56  from logilab.common.deprecation import deprecated 
 57   
 58  MANUAL_UNICODE_MAP = { 
 59      u'\xa1': u'!',    # INVERTED EXCLAMATION MARK 
 60      u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE 
 61      u'\u2044': u'/',  # FRACTION SLASH 
 62      u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE 
 63      u'\xa9': u'(c)',  # COPYRIGHT SIGN 
 64      u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 
 65      u'\xe6': u'ae',   # LATIN SMALL LETTER AE 
 66      u'\xae': u'(r)',  # REGISTERED SIGN 
 67      u'\u0153': u'oe', # LATIN SMALL LIGATURE OE 
 68      u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE 
 69      u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE 
 70      u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE 
 71      u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 
 72      u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S 
 73      } 
 74   
75 -def unormalize(ustring, ignorenonascii=None, substitute=None):
76 """replace diacritical characters with their corresponding ascii characters 77 78 Convert the unicode string to its long normalized form (unicode character 79 will be transform into several characters) and keep the first one only. 80 The normal form KD (NFKD) will apply the compatibility decomposition, i.e. 81 replace all compatibility characters with their equivalents. 82 83 :type substitute: str 84 :param substitute: replacement character to use if decomposition fails 85 86 :see: Another project about ASCII transliterations of Unicode text 87 http://pypi.python.org/pypi/Unidecode 88 """ 89 # backward compatibility, ignorenonascii was a boolean 90 if ignorenonascii is not None: 91 warn("ignorenonascii is deprecated, use substitute named parameter instead", 92 DeprecationWarning, stacklevel=2) 93 if ignorenonascii: 94 substitute = '' 95 res = [] 96 for letter in ustring[:]: 97 try: 98 replacement = MANUAL_UNICODE_MAP[letter] 99 except KeyError: 100 replacement = _uninormalize('NFKD', letter)[0] 101 if ord(replacement) >= 2 ** 7: 102 if substitute is None: 103 raise ValueError("can't deal with non-ascii based characters") 104 replacement = substitute 105 res.append(replacement) 106 return u''.join(res)
107
def unquote(string):
    """Remove optional quotes (simple or double) from the string.

    The leading and the trailing quote are removed independently of each
    other, so a string quoted on one side only loses that single quote.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # The string may have become empty once the leading quote was removed
    # (input made of a single quote character): guard the index access.
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string


# Separator between two paragraphs: two consecutive line breaks, each one
# optionally preceded by a carriage return.
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace; raw string so that ``\s`` is not parsed as an
# (invalid) string escape sequence.
_NORM_SPACES_RGX = re.compile(r'\s+')

def normalize_text(text, line_len=80, indent='', rest=False):
    """Wrap a multi-paragraph text to a maximum line size.

    The text is split into paragraphs on blank lines; each paragraph is
    wrapped on its own and the paragraphs are joined back with a line that
    contains only the indentation string. The indentation string may be used
    to insert a comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true paragraphs are wrapped with `normalize_rest_paragraph`,
      otherwise with `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    wrap_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    paragraphs = [wrap_paragraph(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join(paragraphs)
158 159
def normalize_paragraph(text, line_len=80, indent=''):
    """Wrap a single paragraph to a maximum line size.

    Every run of whitespace (line jumps included) is first collapsed into a
    single space, then the text is cut into lines with `splittext`. The
    indentation string may be used to insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    # the indentation is part of the line, so it reduces the usable width
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
188
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """Wrap a ReST paragraph to a maximum line size.

    Unlike `normalize_paragraph`, the original line jumps are kept: each
    input line is stripped, has its whitespace runs collapsed and is only
    split (with `splittext`) when it is longer than the available width.
    The indentation string may be used to insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    width = line_len - len(indent)
    wrapped = []
    for raw_line in text.splitlines():
        pending = _NORM_SPACES_RGX.sub(' ', raw_line.strip())
        while len(pending) > width:
            # too long line, need split
            head, tail = splittext(pending, width)
            wrapped.append(indent + head)
            pending = tail + ' ' if tail else ''
        if pending:
            wrapped.append(indent + pending.strip())
    return linesep.join(wrapped)
228 229
def splittext(text, line_len):
    """Split the given text on a space according to the given max line size.

    The split point is the last space found at or before index `line_len`.
    When there is none, the first space after `line_len` is used, and when
    the text holds no such space either it is returned whole.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len: the maximum size wanted for the first line

    :rtype: tuple
    :return:
      a 2-uple made of a line <= `line_len` if possible, and of the rest of
      the text (stripped) which has to be reported on another line
    """
    size = len(text)
    if size <= line_len:
        return text, ''
    cut = min(size - 1, line_len)
    # look backward for a space to break on
    while cut > 0:
        if text[cut] == ' ':
            break
        cut -= 1
    if cut == 0:
        # no usable space before the limit: look forward instead
        cut = min(size, line_len)
        while cut < size and text[cut] != ' ':
            cut += 1
    return text[:cut], text[cut + 1:].strip()
247 248
def splitstrip(string, sep=','):
    """Return a list of stripped strings by splitting the string given as
    argument on `sep` (',' by default). Empty strings are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list of str or unicode
    :return: the stripped, non-empty fields, in their original order
    """
    # strip each field only once, then drop the ones left empty
    stripped = (word.strip() for word in string.split(sep))
    return [word for word in stripped if word]

# Former name of `splitstrip`, kept as an alias wrapped by the `deprecated`
# helper (presumably so that callers get a deprecation warning -- the helper
# lives in logilab.common.deprecation, confirm there).
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)


def split_url_or_path(url_or_path):
    """Split a string containing either an url of the form
    <scheme>://<path> or a local file system path into its parent part and
    its latest component.

    :type url_or_path: str or unicode
    :param url_or_path: the url or file system path to split

    :rtype: list or tuple
    :return:
      for an url, the list produced by splitting once on the last '/'
      (trailing slashes ignored); for a path, the (head, tail) tuple
      returned by `os.path.split` (trailing separators ignored)
    """
    if '://' not in url_or_path:
        return osp.split(url_or_path.rstrip(osp.sep))
    trimmed = url_or_path.rstrip('/')
    return trimmed.rsplit('/', 1)
280 281
def text_to_dict(text):
    """Parse a multilines text containing simple 'key=value' lines and
    return a dict of {'key': 'value'}.

    Blank lines and lines starting with '#' are skipped. Keys and values are
    stripped. When the same key is encountered several times, its value is
    turned into a list holding all the values in order of appearance. For
    instance the three lines ``multiple=1``, ``multiple= 2`` and
    ``single =3`` give ``{'single': '3', 'multiple': ['1', '2']}``.

    :type text: str or unicode
    :param text: the text to parse (may be empty or None)

    :rtype: dict
    :return: the parsed key / value(s) mapping
    """
    parsed = {}
    if not text:
        return parsed
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith('#'):
            continue
        key, value = [part.strip() for part in entry.split('=', 1)]
        if key not in parsed:
            parsed[key] = value
            continue
        try:
            parsed[key].append(value)
        except AttributeError:
            # second occurrence: the stored value is still a plain string
            parsed[key] = [parsed[key], value]
    return parsed


# Characters ignored between values: whitespace and commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# A numeric literal, optionally negative, and the letters of a unit name.
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE))
# A whole (blank-free) string: any number of value+unit pairs, optionally
# followed by a last value without unit.
_VALIDATION_RE = re.compile(r'^((%s)(%s))*(%s)?$' % (__VALUE_URE, __UNITS_URE,
                                                    __VALUE_URE))

# Multipliers converting a size to bytes.
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# Multipliers converting a duration to seconds.
TIME_UNITS = {
    "ms": 0.001,
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 *24,
}


def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90.0).

    Blanks are removed, then every value found is multiplied by the factor
    of its unit (if any) and all the results are summed.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter:
      used to parse every intermediate value (the results are multiplied
      and summed); default to `final`

    :type final: type
    :param final: used to convert the sum of the values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit into the string

    :raise ValueError: if the string is empty or is not a valid unit string
    :raise KeyError: if a unit (lower-cased) is not a key of `units`

    :return: the sum of the parsed values, converted with `final`
    """
    if inter is None:
        inter = final
    # honour the caller's blank pattern rather than the module default
    fstring = blank_reg.sub('', string)
    if not (fstring and _VALIDATION_RE.match(fstring)):
        raise ValueError("Invalid unit string: %r." % string)
    values = []
    for match in value_reg.finditer(fstring):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, units.keys()))
        values.append(value)
    return final(sum(values))



# Any flavour of line break: CRLF, a run of CR, or LF.
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """Return a string with the match location underlined.

    For the match of 'mange' in 'il mange du bacon', the result is that text
    followed by a second line made of three spaces and five carets.

    Line breaks of `string` are first converted to the platform line
    separator, then an underline is inserted right after the line on which
    the match starts. Trailing whitespace is removed from the result.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    match_start, match_end = match.start(), match.end()
    normalized = _LINE_RGX.sub(linesep, string)
    eol_len = len(linesep)
    parts = []
    # locate the beginning of the line holding the match
    prev_eol = normalized.rfind(linesep, 0, match_start)
    if prev_eol == -1:
        line_start = 0
    else:
        parts.append(normalized[:prev_eol])
        line_start = prev_eol + eol_len
    padding = ' ' * (match_start - line_start)
    marker = padding + underline_char * (match_end - match_start)
    # locate the end of the line holding the end of the match
    next_eol = normalized.find(linesep, match_end)
    if next_eol == -1:
        parts.extend([normalized[line_start:], marker])
    else:
        parts.extend([normalized[line_start:next_eol], marker,
                      normalized[next_eol + eol_len:]])
    return linesep.join(parts).rstrip()



# Ansi colorization ###########################################################

# An escape sequence is ANSI_PREFIX, ';'-separated codes, then ANSI_END.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
# Sequence cancelling every formatting previously set.
ANSI_RESET = '\033[0m'

# style identifier -> ANSI terminal code
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}

# color identifier -> ANSI terminal code (foreground)
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}

def _get_ansi_code(color=None, style=None):
    """Return the ansi escape code corresponding to color and style.

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available (a string made of
      digits only, emitted after the '38' and '5' codes)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or the empty string when neither a color nor a
      style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
488
def colorize_ansi(msg, color=None, style=None):
    """Colorize a message by wrapping it with ansi escape codes.

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the message preceded by the escape code and followed by `ANSI_RESET`,
      or the message unchanged when no escape code applies
    """
    # Nothing requested: leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # No code was built (empty color and style): don't wrap msg
    if not escape_code:
        return msg
    return '%s%s%s' % (escape_code, msg, ANSI_RESET)


# Color (see `ANSI_COLORS`) used for each kind of diff line.
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}


def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """Write the lines of a diff to `out`, colorized with ansi escape codes.

    File header lines ('--- ' / '+++ ') use the 'separator' color, removed
    lines ('-') the 'remove' color and added lines ('+') the 'add' color.
    Any other line, empty lines included, is written unchanged.

    :type lines: iterable of str or unicode
    :param lines: the diff lines, written as given (no line end is added)

    :type out: file-like object
    :param out: the stream to write to, default to `sys.stdout`

    :type style: dict
    :param style:
      mapping with the 'separator', 'remove' and 'add' keys giving the
      color identifier of each kind of line, default to `DIFF_STYLE`
    """
    for line in lines:
        # startswith() copes with empty lines, unlike indexing line[0].
        # Headers are tested first since they also start with '-' or '+'.
        if line.startswith(('--- ', '+++ ')):
            kind = 'separator'
        elif line.startswith('-'):
            kind = 'remove'
        elif line.startswith('+'):
            kind = 'add'
        else:
            out.write(line)
            continue
        out.write(colorize_ansi(line, style[kind]))

534