Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

textwrap.py @ 40094

Last change on this file since 40094 was 40094, checked in by obi, 7 years ago
tithek add youtube-dl support
File size: 16.8 KB

Line
1	"""Text wrapping and filling.
2	"""
3
4	# Copyright (C) 1999-2001 Gregory P. Ward.
5	# Copyright (C) 2002, 2003 Python Software Foundation.
6	# Written by Greg Ward <gward@python.net>
7
8	__revision__ = "$Id$"
9
10	import string, re
11
# Bind _unicode to the interpreter's unicode type when it exists.  On a
# Python built without Unicode support the name lookup raises NameError;
# in that case a do-nothing placeholder class is used instead, so that
# isinstance(text, _unicode) checks are simply never true.
try:
    unicode
except NameError:
    class _unicode(object):
        pass
else:
    _unicode = unicode
19
20	# Do the right thing with boolean values for all known Python versions
21	# (so this module can be copied to projects that don't depend on Python
22	# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
23	#try:
24	# True, False
25	#except NameError:
26	# (True, False) = (1, 0)
27
28	__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']
29
30	# Hardcode the recognized whitespace characters to the US-ASCII
31	# whitespace characters. The main reason for doing this is that in
32	# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
33	# that character winds up in string.whitespace. Respecting
34	# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
35	# same as any other whitespace char, which is clearly wrong (it's a
36	# non-breaking space), 2) possibly cause problems with Unicode,
37	# since 0xa0 is not in range(128).
38	_whitespace = '\t\n\x0b\x0c\r '
39
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Translation table for byte strings: maps every recognized
    # whitespace character to a plain space.
    whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))

    # The same mapping for unicode strings, keyed by code point as
    # required by unicode.translate().
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    # The three alternatives are joined with a plain '|' (alternation);
    # an escaped '\|' would only match a literal pipe character.
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.lowercase)


    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options as instance attributes.

        Every argument is saved under the attribute of the same name;
        see the class docstring for the meaning of each option.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".

        Tab expansion only happens when 'expand_tabs' is true, and the
        conversion to spaces only when 'replace_whitespace' is true.
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # Byte strings and unicode strings need different kinds of
            # translation table; any other type is returned untouched.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, _unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is True, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Unicode text uses the re.U variants compiled in __init__.
        if isinstance(text, _unicode):
            if self.break_on_hyphens:
                pat = self.wordsep_re_uni
            else:
                pat = self.wordsep_simple_re_uni
        else:
            if self.break_on_hyphens:
                pat = self.wordsep_re
            else:
                pat = self.wordsep_simple_re
        # Splitting on a capturing group yields empty strings between
        # adjacent separators; drop them.  A list is built explicitly
        # because _wrap_chunks() reverses and pops from the result.
        return [chunk for chunk in pat.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        The list is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip the space chunk that was just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'reversed_chunks' is the stack of remaining chunks (next chunk
        last) and 'cur_line' the chunks of the line being built; both
        may be modified in place.  Nothing is returned.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        Raises ValueError if 'self.width' is not positive.  The
        'chunks' list is reversed and consumed in place.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on any line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
339
340
341	# -- Convenience interface ---------------------------------------------
342
def wrap(text, width=70, **kwargs):
    """Split one paragraph into a list of lines no wider than 'width'.

    A TextWrapper is built from 'width' plus any extra keyword
    arguments, and its wrap() method is applied to 'text'.  By default,
    tabs in 'text' are expanded with string.expandtabs(), and all other
    whitespace characters (including newline) are converted to space.
    See the TextWrapper class for the keyword arguments that customize
    wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
355
def fill(text, width=70, **kwargs):
    """Re-flow one paragraph into a single newline-joined string.

    A TextWrapper is built from 'width' plus any extra keyword
    arguments, and its fill() method is applied to 'text', so the
    result is a new string whose lines are no more than 'width'
    columns wide.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  See the TextWrapper
    class for the keyword arguments that customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
367
368
369	# -- Loosely related functionality -------------------------------------
370
# Lines that consist solely of spaces and/or tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the run of spaces/tabs in front of the first "real" character
# of every non-blank line.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Strip the whitespace prefix shared by every line of `text`.

    Handy for triple-quoted strings that are indented in the source
    code but should start at the left edge when displayed.

    Lines containing only spaces and tabs are first reduced to empty
    lines and do not take part in the computation.  Tabs and spaces
    both count as whitespace but are never considered equal: the lines
    "  hello" and "\\thello" have no common leading whitespace, so
    nothing is removed from them.  (This behaviour is new in Python
    2.5; older versions of this module incorrectly expanded tabs before
    searching for common leading whitespace.)

    Returns the dedented string.
    """
    # Whitespace-only lines must not influence the margin.
    text = _whitespace_only_re.sub('', text)

    # Fold the indentation of every non-blank line into the longest
    # prefix common to all of them.
    margin = None
    for indent in _leading_whitespace_re.findall(text):
        if margin is None:
            # First non-blank line: its indentation is the candidate.
            margin = indent
        elif indent.startswith(margin):
            # Deeper than (or equal to) the candidate: candidate stands.
            continue
        elif margin.startswith(indent):
            # Shallower but consistent with the candidate: take it over.
            margin = indent
        else:
            # The two diverge somewhere: keep only the part they share.
            shared = 0
            for left, right in zip(margin, indent):
                if left != right:
                    break
                shared += 1
            margin = margin[:shared]

    # sanity check (testing/debugging only)
    if 0 and margin:
        for line in text.split("\n"):
            assert not line or line.startswith(margin), \
                   "line = %r, margin = %r" % (line, margin)

    if not margin:
        return text
    return re.sub(r'(?m)^' + margin, '', text)
425
if __name__ == "__main__":
    # Tiny manual demo of dedent(); the commented-out calls are
    # alternative inputs kept for experimentation.
    #print dedent("\tfoo\n\tbar")
    #print dedent(" \thello there\n \t how are you?")
    print(dedent("Hello there.\n This is indented."))

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: titan/mediathek/localhoster/lib/python2.7/textwrap.py @ 40094

Download in other formats: