Package translate :: Package storage :: Module pypo
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.pypo

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2007 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of .po files (pounit) or entire files (pofile) 
 23  gettext-style .po (or .pot) files are used in translations for KDE et al (see kbabel)""" 
 24   
 25  from __future__ import generators 
 26  from translate.misc.multistring import multistring 
 27  from translate.misc import quote 
 28  from translate.misc import textwrap 
 29  from translate.lang import data 
 30  from translate.storage import pocommon 
 31  import re 
 32   
 33  lsep = "\n#: " 
 34  """Seperator for #: entries""" 
 35   
 36  # general functions for quoting / unquoting po strings 
 37   
 38  po_unescape_map = {"\\r": "\r", "\\t": "\t", '\\"': '"', '\\n': '\n', '\\\\': '\\'} 
 39  po_escape_map = dict([(value, key) for (key, value) in po_unescape_map.items()]) 
 40   
def escapeforpo(line):
    """Escapes a line for po format. assumes no \\n occurs in the line.

    Every character that is a key of C{po_escape_map} is replaced by its
    escape sequence; all other text is copied through unchanged.

    @param line: unescaped text
    @return: the text with all special characters escaped
    """
    # Collect the position of every special character; a dict is used so
    # that each position is recorded only once.
    positions = {}
    for special in po_escape_map:
        for position in quote.find_all(line, special):
            positions[position] = None
    ordered = positions.keys()
    ordered.sort()

    # Stitch the result together: plain text between the special characters,
    # interleaved with the escape sequences that replace them.
    escaped_line = ""
    copied_upto = 0
    for position in ordered:
        plain = line[copied_upto:position]
        replacement = po_escape_map[line[position:position+1]]
        escaped_line = escaped_line + plain + replacement
        copied_upto = position + 1
    return escaped_line + line[copied_upto:]
59
def unescapehandler(escape):
    """Translates one escape sequence found in a po string.

    @param escape: the escape sequence as it appears in the file
    @return: the entry of C{po_unescape_map} for the sequence, or the
        sequence itself when the table has no entry for it
    """
    try:
        return po_unescape_map[escape]
    except KeyError:
        return escape
63
def wrapline(line):
    """Wrap text for po files.

    The text is wrapped at 76 columns without altering any whitespace.
    A space that would start a continuation line is moved to the end of
    the line before it.

    @param line: the text to wrap
    @return: list of wrapped lines
    """
    wrappedlines = textwrap.wrap(line, 76, replace_whitespace=False, expand_tabs=False, drop_whitespace=False)

    # Lines should not start with a space...
    for position in range(1, len(wrappedlines)):
        current = wrappedlines[position]
        if current.startswith(' '):
            # Remove the space at the beginning of the line:
            wrappedlines[position] = current[1:]
            # Append a space to the previous line:
            wrappedlines[position - 1] += ' '
    return wrappedlines
78
def quoteforpo(text):
    """quotes the given text for a PO file, returning quoted and escaped lines

    Text that contains a newline, or whose only line is longer than 71
    characters, is wrapped over several quoted lines.  Such output starts
    with an empty '""' line, except when the text is a single line ended by
    exactly one newline.

    @param text: the unescaped text, or None
    @return: list of quoted lines; an empty list when text is None
    """
    quoted = []
    if text is None:
        return quoted
    paragraphs = text.split("\n")
    needswrapping = len(paragraphs) > 1 or (len(paragraphs) == 1 and len(paragraphs[0]) > 71)
    if needswrapping:
        if len(paragraphs) != 2 or paragraphs[1]:
            quoted.append('""')
        # every paragraph but the last was followed by a newline in the text
        for paragraph in paragraphs[:-1]:
            wrapped = wrapline(paragraph)
            if len(wrapped) > 0:
                for piece in wrapped[:-1]:
                    quoted.append('"' + escapeforpo(piece) + '"')
                if wrapped[-1]:
                    quoted.append('"' + escapeforpo(wrapped[-1]) + '\\n"')
            else:
                # an empty paragraph is just the newline itself
                quoted.append('"\\n"')
    lastparagraph = paragraphs[-1]
    if lastparagraph:
        for piece in wrapline(lastparagraph):
            quoted.append('"' + escapeforpo(piece) + '"')
    return quoted
100
def extractpoline(line):
    """Remove quote and unescape line from po file.

    @param line: a quoted line from a po file (msgid or msgstr)
    @return: the first element of the result of
        C{quote.extractwithoutquotes}, called with C{unescapehandler} as
        the escape handler
    """
    result = quote.extractwithoutquotes(line, '"', '"', '\\', includeescapes=unescapehandler)
    return result[0]
108
def unquotefrompo(postr, joinwithlinebreak=False):
    """Unquotes a list of quoted po lines and joins them into one string.

    @param postr: list of quoted lines as stored in a unit
    @param joinwithlinebreak: when true, the unquoted lines are joined with
        a newline and a leading empty '""' line is skipped; otherwise the
        lines are concatenated directly
    @return: the joined, unquoted text
    """
    separator = u""
    if joinwithlinebreak:
        separator = u"\n"
        if postr and postr[0] == '""':
            postr = postr[1:]
    unquoted = []
    for quotedline in postr:
        unquoted.append(extractpoline(quotedline))
    return separator.join(unquoted)
116
def encodingToUse(encoding):
    """Returns the encoding to work with, so that a usable value is always
    available.

    The placeholder "CHARSET" and None both yield 'utf-8'; any other value
    is handed back unchanged (it is not checked against the codecs known to
    the python runtime).

    @param encoding: an encoding name, the placeholder "CHARSET", or None
    @return: 'utf-8' or the given encoding
    """
    if encoding in ("CHARSET", None):
        return 'utf-8'
    return encoding
#    if encoding is None: return False
#    return True
#    try:
#        tuple = codecs.lookup(encoding)
#    except LookupError:
#        return False
#    return True

"""
From the GNU gettext manual:
     WHITE-SPACE
     #  TRANSLATOR-COMMENTS
     #. AUTOMATIC-COMMENTS
     #| PREVIOUS MSGID                 (Gettext 0.16 - check if this is the correct position - not yet implemented)
     #: REFERENCE...
     #, FLAG...
     msgctxt CONTEXT                   (Gettext 0.15)
     msgid UNTRANSLATED-STRING
     msgstr TRANSLATED-STRING
"""
class pounit(pocommon.pounit):
    """A single unit (entry) of a gettext .po file.

    Every part of the unit is kept as a list of lines exactly as they are
    written in the file (quoted and escaped).  C{msgstr} is a list for a
    singular unit and a dict mapping the plural index to such a list when
    the unit was parsed from or set to plural forms.
    """
    # othercomments = []      #   # this is another comment
    # automaticcomments = []  #   #. comment extracted from the source code
    # sourcecomments = []     #   #: sourcefile.xxx:35
    # typecomments = []       #   #, fuzzy
    # msgidcomments = []      #   _: within msgid
    # msgctxt
    # msgid = []
    # msgstr = []

    def __init__(self, source=None, encoding="UTF-8"):
        """Creates an empty unit, optionally with the given source text.

        @param source: unescaped source text (msgid) to start with
        @param encoding: encoding used to decode byte strings handed to the
            unit; "CHARSET" and None are treated as utf-8
        """
        self._encoding = encodingToUse(encoding)
        self.obsolete = False
        self._initallcomments(blankall=True)
        self.msgctxt = []
        self.msgid = []
        self.msgid_pluralcomments = []
        self.msgid_plural = []
        self.msgstr = []
        self.obsoletemsgctxt = []
        self.obsoletemsgid = []
        self.obsoletemsgid_pluralcomments = []
        self.obsoletemsgid_plural = []
        self.obsoletemsgstr = []
        if source:
            self.setsource(source)
        super(pounit, self).__init__(source)

    def _initallcomments(self, blankall=False):
        """Initialises allcomments

        @param blankall: when true, all comment lists are first reset to
            empty lists; otherwise C{allcomments} is only rebuilt from the
            lists that currently exist
        """
        if blankall:
            self.othercomments = []
            self.automaticcomments = []
            self.sourcecomments = []
            self.typecomments = []
            self.msgidcomments = []
            self.obsoletemsgidcomments = []
        self.allcomments = [self.othercomments,
                            self.automaticcomments,
                            self.sourcecomments,
                            self.typecomments,
                            self.msgidcomments,
                            self.obsoletemsgidcomments]

    def getsource(self):
        """Returns the unescaped msgid

        @return: a multistring holding the msgid and, for a plural unit,
            the msgid_plural as second string
        """
        multi = multistring(unquotefrompo(self.msgid), self._encoding)
        if self.hasplural():
            pluralform = unquotefrompo(self.msgid_plural)
            if isinstance(pluralform, str):
                pluralform = pluralform.decode(self._encoding)
            multi.strings.append(pluralform)
        return multi

    def setsource(self, source):
        """Sets the msgid to the given (unescaped) value.

        A list (or multistring) sets the msgid from its first element and
        the msgid_plural from its second element, if present.

        @param source: an unescaped source string.
        """
        if isinstance(source, str):
            source = source.decode(self._encoding)
        if isinstance(source, multistring):
            source = source.strings
        if isinstance(source, list):
            self.msgid = quoteforpo(source[0])
            if len(source) > 1:
                self.msgid_plural = quoteforpo(source[1])
        else:
            self.msgid = quoteforpo(source)
    source = property(getsource, setsource)

    def gettarget(self):
        """Returns the unescaped msgstr

        @return: a multistring; for a plural unit it is built from all the
            values of the msgstr dict
        """
        if isinstance(self.msgstr, dict):
            multi = multistring(map(unquotefrompo, self.msgstr.values()), self._encoding)
        else:
            multi = multistring(unquotefrompo(self.msgstr), self._encoding)
        return multi

    def settarget(self, target):
        """Sets the msgstr to the given (unescaped) value

        @param target: a string, a multistring, or a list / dict of plural
            forms
        @raise ValueError: if the unit has no plural but target is a list
            or dict that does not hold exactly one element
        """
        if isinstance(target, str):
            target = target.decode(self._encoding)
        if target == self.target:
            return
        if self.hasplural():
            if isinstance(target, multistring):
                target = target.strings
            elif isinstance(target, basestring):
                target = [target]
        elif isinstance(target, (dict, list)):
            if len(target) == 1:
                target = target[0]
            else:
                raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target))
        if isinstance(target, list):
            self.msgstr = dict([(i, quoteforpo(target[i])) for i in range(len(target))])
        elif isinstance(target, dict):
            self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in target.iteritems()])
        else:
            self.msgstr = quoteforpo(target)
    target = property(gettarget, settarget)

    def getnotes(self, origin=None):
        """Return comments based on origin value (programmer, developer, source code and translator)

        @param origin: None for all notes, "translator" for the other
            comments, or one of "programmer", "developer", "source code" for
            the automatic comments
        @return: the comment text with the comment markers and the final
            character (the last newline) removed
        @raise ValueError: for any other origin
        """
        if origin is None:
            comments = u"".join([comment[2:] for comment in self.othercomments])
            comments += u"".join([comment[3:] for comment in self.automaticcomments])
        elif origin == "translator":
            comments = u"".join([comment[2:] for comment in self.othercomments])
        elif origin in ["programmer", "developer", "source code"]:
            comments = u"".join([comment[3:] for comment in self.automaticcomments])
        else:
            raise ValueError("Comment type not valid")
        # Let's drop the last newline
        return comments[:-1]

    def addnote(self, text, origin=None, position="append"):
        """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote

        @param text: the note; each line of it becomes one comment line
        @param origin: "programmer", "developer" or "source code" adds an
            automatic ("#. ") comment, anything else a translator ("# ")
            comment
        @param position: "append" adds after the existing comments, any
            other value adds in front of them
        """
        # We don't want to put in an empty '#' without a real comment:
        if not text:
            return
        text = data.forceunicode(text)
        commentlist = self.othercomments
        linestart = "# "
        if origin in ["programmer", "developer", "source code"]:
            commentlist = self.automaticcomments
            linestart = "#. "
        newcomments = [linestart + line + "\n" for line in text.split("\n")]
        if position == "append":
            commentlist += newcomments
        else:
            # Prepend in place.  The previous implementation tested a flag
            # that was only bound for programmer notes, so prepending a
            # translator note raised UnboundLocalError.
            commentlist[:0] = newcomments

    def removenotes(self):
        """Remove all the translator's notes (other comments)"""
        self.othercomments = []

    def copy(self):
        """Returns a new unit of the same class carrying copies of this
        unit's comments, message parts, obsolete parts and encoding."""
        newpo = self.__class__()
        # keep the encoding, since it is used when decoding byte strings
        newpo._encoding = self._encoding
        newpo.othercomments = self.othercomments[:]
        newpo.automaticcomments = self.automaticcomments[:]
        newpo.sourcecomments = self.sourcecomments[:]
        newpo.typecomments = self.typecomments[:]
        newpo.obsolete = self.obsolete
        newpo.msgidcomments = self.msgidcomments[:]
        # previously not copied, which lost the msgid comments of obsolete units
        newpo.obsoletemsgidcomments = self.obsoletemsgidcomments[:]
        newpo._initallcomments()
        newpo.msgctxt = self.msgctxt[:]
        newpo.msgid = self.msgid[:]
        newpo.msgid_pluralcomments = self.msgid_pluralcomments[:]
        newpo.msgid_plural = self.msgid_plural[:]
        if isinstance(self.msgstr, dict):
            newpo.msgstr = self.msgstr.copy()
        else:
            newpo.msgstr = self.msgstr[:]

        newpo.obsoletemsgctxt = self.obsoletemsgctxt[:]
        newpo.obsoletemsgid = self.obsoletemsgid[:]
        newpo.obsoletemsgid_pluralcomments = self.obsoletemsgid_pluralcomments[:]
        newpo.obsoletemsgid_plural = self.obsoletemsgid_plural[:]
        if isinstance(self.obsoletemsgstr, dict):
            newpo.obsoletemsgstr = self.obsoletemsgstr.copy()
        else:
            newpo.obsoletemsgstr = self.obsoletemsgstr[:]
        return newpo

    def msgidlen(self):
        """Returns the length of the stripped, unquoted msgid (plus that of
        the msgid_plural for a plural unit)."""
        if self.hasplural():
            return len(unquotefrompo(self.msgid).strip()) + len(unquotefrompo(self.msgid_plural).strip())
        else:
            return len(unquotefrompo(self.msgid).strip())

    def msgstrlen(self):
        """Returns the length of the stripped, unquoted msgstr; plural forms
        are joined with newlines before measuring."""
        if isinstance(self.msgstr, dict):
            combinedstr = "\n".join([unquotefrompo(msgstr).strip() for msgstr in self.msgstr.itervalues()])
            return len(combinedstr.strip())
        else:
            return len(unquotefrompo(self.msgstr).strip())

    def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
        """Merges the otherpo (with the same msgid) into this one.

        Overwrite non-blank self.msgstr only if overwrite is True
        merge comments only if comments is True

        @param otherpo: the unit to merge from; if it is not a pounit the
            merge is delegated to the base class
        @param overwrite: replace the target even if this unit is translated
        @param comments: whether comments are merged as well
        @param authoritative: when true, the automatic, msgid and source
            comments of otherpo are not brought across
        """

        def mergelists(list1, list2, split=False):
            """Adds to list1 (in place) the entries of list2 it lacks."""
            #decode where necessary
            if unicode in [type(item) for item in list2] + [type(item) for item in list1]:
                for position, item in enumerate(list1):
                    if isinstance(item, str):
                        list1[position] = item.decode("utf-8")
                for position, item in enumerate(list2):
                    if isinstance(item, str):
                        list2[position] = item.decode("utf-8")

            #Determine the newline style of list1
            lineend = ""
            if list1 and list1[0]:
                # NOTE(review): "\n\r" looks like it may have been meant as
                # "\r\n" - confirm before changing.
                for candidate in ["\n", "\r", "\n\r"]:
                    if list1[0].endswith(candidate):
                        lineend = candidate
            else:
                lineend = "\n"

            #Split if directed to do so:
            if split:
                splitlist1 = []
                splitlist2 = []
                prefix = "#"
                for item in list1:
                    splitlist1.extend(item.split()[1:])
                    prefix = item.split()[0]
                for item in list2:
                    splitlist2.extend(item.split()[1:])
                    prefix = item.split()[0]
                list1.extend(["%s %s%s" % (prefix, item, lineend) for item in splitlist2 if not item in splitlist1])
            else:
                #Normal merge, but conform to list1 newline style
                if list1 != list2:
                    for item in list2:
                        if lineend:
                            item = item.rstrip() + lineend
                        # avoid duplicate comment lines (this might cause some problems)
                        if item not in list1 or len(item) < 5:
                            list1.append(item)

        if not isinstance(otherpo, pounit):
            super(pounit, self).merge(otherpo, overwrite, comments)
            return
        if comments:
            mergelists(self.othercomments, otherpo.othercomments)
            mergelists(self.typecomments, otherpo.typecomments)
            if not authoritative:
                # We don't bring across otherpo.automaticcomments as we consider ourself
                # to be the the authority.  Same applies to otherpo.msgidcomments
                mergelists(self.automaticcomments, otherpo.automaticcomments)
                mergelists(self.msgidcomments, otherpo.msgidcomments)
                mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
        if not self.istranslated() or overwrite:
            # Remove kde-style comments from the translation (if any).
            if self._extract_msgidcomments(otherpo.target):
                otherpo.target = otherpo.target.replace('_: ' + otherpo._extract_msgidcomments() + '\n', '')
            self.target = otherpo.target
            if self.source != otherpo.source:
                self.markfuzzy()
            else:
                self.markfuzzy(otherpo.isfuzzy())
        elif not otherpo.istranslated():
            if self.source != otherpo.source:
                self.markfuzzy()
        else:
            if self.target != otherpo.target:
                self.markfuzzy()

    def isheader(self):
        """Returns whether this unit is a header: blank msgid, non-blank
        msgstr, and no msgid comments, context or source comments."""
        #return (self.msgidlen() == 0) and (self.msgstrlen() > 0) and (len(self.msgidcomments) == 0)
        #rewritten here for performance:
        return ((self.msgid == [] or self.msgid == ['""']) and
                not (self.msgstr == [] or self.msgstr == ['""'])
                and self.msgidcomments == []
                and (self.msgctxt == [] or self.msgctxt == ['""'])
                and (self.sourcecomments == [] or self.sourcecomments == [""]))

    def isblank(self):
        """Returns whether both msgid and msgstr are empty; a header or a
        unit with msgid comments is never blank."""
        if self.isheader() or len(self.msgidcomments):
            return False
        if (self.msgidlen() == 0) and (self.msgstrlen() == 0):
            return True
        return False
        # TODO: remove:
        # Before, the equivalent of the following was the final return statement:
        # return len(self.source.strip()) == 0

    def hastypecomment(self, typecomment):
        """check whether the given type comment is present"""
        # check for word boundaries properly by using a regular expression...
        return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0

    def hasmarkedcomment(self, commentmarker):
        """check whether the given comment marker is present as # (commentmarker) ..."""
        commentmarker = "(%s)" % commentmarker
        for comment in self.othercomments:
            if comment.replace("#", "", 1).strip().startswith(commentmarker):
                return True
        return False

    def settypecomment(self, typecomment, present=True):
        """alters whether a given typecomment is present"""
        if self.hastypecomment(typecomment) != present:
            if present:
                self.typecomments.append("#, %s\n" % typecomment)
            else:
                # this should handle word boundaries properly ...
                typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments)
                # drop lines that are left with nothing but the "#," marker
                self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments)

    def istranslated(self):
        """Returns whether the base class considers the unit translated and
        the unit is not obsolete."""
        return super(pounit, self).istranslated() and not self.isobsolete()

    def istranslatable(self):
        """Returns whether the unit is neither a header nor blank."""
        return not (self.isheader() or self.isblank())

    def isfuzzy(self):
        """Returns whether the "fuzzy" type comment is present."""
        return self.hastypecomment("fuzzy")

    def markfuzzy(self, present=True):
        """Sets or removes the "fuzzy" type comment."""
        self.settypecomment("fuzzy", present)

    def isreview(self):
        """Returns whether the unit has a "review" type comment or a
        "(review)" / "(pofilter)" marked comment."""
        return self.hastypecomment("review") or self.hasmarkedcomment("review") or self.hasmarkedcomment("pofilter")

    def isobsolete(self):
        """Returns whether this unit is marked obsolete."""
        return self.obsolete

    def makeobsolete(self):
        """Makes this unit obsolete"""
        self.obsolete = True
        # Unlike the other parts, msgctxt is copied but not cleared here.
        if self.msgctxt:
            self.obsoletemsgctxt = self.msgctxt
        if self.msgid:
            self.obsoletemsgid = self.msgid
            self.msgid = []
        if self.msgidcomments:
            self.obsoletemsgidcomments = self.msgidcomments
            self.msgidcomments = []
        if self.msgid_plural:
            self.obsoletemsgid_plural = self.msgid_plural
            self.msgid_plural = []
        if self.msgstr:
            self.obsoletemsgstr = self.msgstr
            self.msgstr = []
        self.sourcecomments = []
        self.automaticcomments = []

    def resurrect(self):
        """Makes an obsolete unit normal"""
        self.obsolete = False
        if self.obsoletemsgctxt:
            # was assigned to self.msgid by mistake
            self.msgctxt = self.obsoletemsgctxt
            self.obsoletemsgctxt = []
        if self.obsoletemsgid:
            self.msgid = self.obsoletemsgid
            self.obsoletemsgid = []
        if self.obsoletemsgidcomments:
            self.msgidcomments = self.obsoletemsgidcomments
            self.obsoletemsgidcomments = []
        if self.obsoletemsgid_plural:
            self.msgid_plural = self.obsoletemsgid_plural
            self.obsoletemsgid_plural = []
        if self.obsoletemsgstr:
            self.msgstr = self.obsoletemsgstr
            # was misspelt "obsoletemgstr", which left the obsolete copy set
            self.obsoletemsgstr = []

    def hasplural(self):
        """returns whether this pounit contains plural strings..."""
        return len(self.msgid_plural) > 0

    def parse(self, src):
        """Reads one unit from the given text.

        @param src: the text of the unit; a byte string is decoded with the
            unit's encoding first
        @return: the number of lines consumed, which includes the line on
            which parsing stopped early (if it did)
        """
        if isinstance(src, str):
            # This has not been decoded yet, so we need to make a plan
            src = src.decode(self._encoding)
        inmsgctxt = 0
        inmsgid = 0
        inmsgid_comment = 0
        inmsgid_plural = 0
        inmsgstr = 0
        msgstr_pluralid = None
        linesprocessed = 0
        for line in src.split("\n"):
            line = line + "\n"
            linesprocessed += 1
            if len(line) == 0:
                continue
            elif line[0] == '#':
                if inmsgstr and not line[1] == '~':
                    # if we're already in the message string, this is from the next element
                    break
                if line[1] == '.':
                    self.automaticcomments.append(line)
                elif line[1] == ':':
                    self.sourcecomments.append(line)
                elif line[1] == ',':
                    self.typecomments.append(line)
                elif line[1] == '~':
                    # strip the obsolete marker and handle the rest of the
                    # line like a normal one below
                    line = line[3:]
                    self.obsolete = True
                else:
                    self.othercomments.append(line)
            if line.startswith('msgid_plural'):
                inmsgctxt = 0
                inmsgid = 0
                inmsgid_plural = 1
                inmsgstr = 0
                inmsgid_comment = 0
            elif line.startswith('msgctxt'):
                inmsgctxt = 1
                inmsgid = 0
                inmsgid_plural = 0
                inmsgstr = 0
                inmsgid_comment = 0
            elif line.startswith('msgid'):
                # if we just finished a msgstr or msgid_plural, there is probably an
                # empty line missing between the units, so let's stop the parsing now.
                if inmsgstr or inmsgid_plural:
                    break
                inmsgctxt = 0
                inmsgid = 1
                inmsgid_plural = 0
                inmsgstr = 0
                inmsgid_comment = 0
            elif line.startswith('msgstr'):
                inmsgctxt = 0
                inmsgid = 0
                inmsgid_plural = 0
                inmsgstr = 1
                if line.startswith('msgstr['):
                    msgstr_pluralid = int(line[len('msgstr['):line.find(']')].strip())
                else:
                    msgstr_pluralid = None
            extracted = quote.extractstr(line)
            if not extracted is None:
                if inmsgctxt:
                    self.msgctxt.append(extracted)
                elif inmsgid:
                    # TODO: improve kde comment detection
                    if extracted.find("_:") != -1:
                        inmsgid_comment = 1
                    if inmsgid_comment:
                        self.msgidcomments.append(extracted)
                    else:
                        self.msgid.append(extracted)
                    if inmsgid_comment and extracted.find("\\n") != -1:
                        inmsgid_comment = 0
                elif inmsgid_plural:
                    if extracted.find("_:") != -1:
                        inmsgid_comment = 1
                    if inmsgid_comment:
                        self.msgid_pluralcomments.append(extracted)
                    else:
                        self.msgid_plural.append(extracted)
                    if inmsgid_comment and extracted.find("\\n") != -1:
                        inmsgid_comment = 0
                elif inmsgstr:
                    if msgstr_pluralid is None:
                        self.msgstr.append(extracted)
                    else:
                        # switch to the dict form on the first plural line
                        if type(self.msgstr) == list:
                            self.msgstr = {0: self.msgstr}
                        if msgstr_pluralid not in self.msgstr:
                            self.msgstr[msgstr_pluralid] = []
                        self.msgstr[msgstr_pluralid].append(extracted)
        if self.obsolete:
            self.makeobsolete()
        # If this unit is the header, we have to get the encoding to ensure that no
        # methods are called that need the encoding before we obtained it.
        if self.isheader():
            charset = re.search("charset=([^\\s]+)", unquotefrompo(self.msgstr))
            if charset:
                self._encoding = encodingToUse(charset.group(1))
        return linesprocessed

    def _getmsgpartstr(self, partname, partlines, partcomments=""):
        """Builds the file text of one message part.

        @param partname: the keyword to write, e.g. "msgid" or "#~ msgstr"
        @param partlines: the quoted lines of the part, or a dict mapping
            plural index to such lines (written as partname[index])
        @param partcomments: quoted msgid comment lines to write before the
            part's own lines
        @return: the text of the part, ending in a newline
        """
        if isinstance(partlines, dict):
            partkeys = partlines.keys()
            partkeys.sort()
            return "".join([self._getmsgpartstr("%s[%d]" % (partname, partkey), partlines[partkey], partcomments) for partkey in partkeys])
        partstr = partname + " "
        partstartline = 0
        if len(partlines) > 0 and len(partcomments) == 0:
            partstr += partlines[0]
            partstartline = 1
        elif len(partcomments) > 0:
            if len(partlines) > 0 and len(unquotefrompo(partlines[:1])) == 0:
                # if there is a blank leader line, it must come before the comment
                partstr += partlines[0] + '\n'
                # but if the whole string is blank, leave it in
                if len(partlines) > 1:
                    partstartline += 1
            else:
                # All partcomments should start on a newline
                partstr += '""\n'
            # combine comments into one if more than one
            if len(partcomments) > 1:
                combinedcomment = []
                for comment in partcomments:
                    comment = unquotefrompo([comment])
                    if comment.startswith("_:"):
                        comment = comment[len("_:"):]
                    if comment.endswith("\\n"):
                        comment = comment[:-len("\\n")]
                    #Before we used to strip. Necessary in some cases?
                    combinedcomment.append(comment)
                partcomments = quoteforpo("_:%s" % "".join(combinedcomment))
            # comments first, no blank leader line needed
            partstr += "\n".join(partcomments)
            partstr = quote.rstripeol(partstr)
        else:
            partstr += '""'
        partstr += '\n'
        # add the rest
        for partline in partlines[partstartline:]:
            partstr += partline + '\n'
        return partstr

    def _encodeifneccessary(self, output):
        """encodes unicode strings and returns other strings unchanged"""
        if isinstance(output, unicode):
            # NOTE(review): this reads an attribute called "encoding", while
            # this class stores the unit's encoding as "_encoding".  Unless a
            # base class provides "encoding", UTF-8 is always used here -
            # confirm which behaviour is intended before changing it.
            encoding = encodingToUse(getattr(self, "encoding", "UTF-8"))
            return output.encode(encoding)
        return output

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        output = self._getoutput()
        return self._encodeifneccessary(output)

    def _getoutput(self):
        """return this po element as a string"""
        lines = []
        lines.extend(self.othercomments)
        if self.isobsolete():
            lines.extend(self.typecomments)
            obsoletelines = []
            if self.obsoletemsgctxt:
                obsoletelines.append(self._getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt))
            obsoletelines.append(self._getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments))
            if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments:
                obsoletelines.append(self._getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments))
            obsoletelines.append(self._getmsgpartstr("#~ msgstr", self.obsoletemsgstr))
            for index, obsoleteline in enumerate(obsoletelines):
                # We need to account for a multiline msgid or msgstr here
                obsoletelines[index] = obsoleteline.replace('\n"', '\n#~ "')
            lines.extend(obsoletelines)
            lines = [self._encodeifneccessary(line) for line in lines]
            return "".join(lines)
        # if there's no msgid don't do msgid and string, unless we're the header
        # this will also discard any comments other than plain othercomments...
        if (len(self.msgid) == 0) or ((len(self.msgid) == 1) and (self.msgid[0] == '""')):
            if not (self.isheader() or self.msgidcomments or self.sourcecomments):
                return "".join(lines)
        lines.extend(self.automaticcomments)
        lines.extend(self.sourcecomments)
        lines.extend(self.typecomments)
        if self.msgctxt:
            lines.append(self._getmsgpartstr("msgctxt", self.msgctxt))
        lines.append(self._getmsgpartstr("msgid", self.msgid, self.msgidcomments))
        if self.msgid_plural or self.msgid_pluralcomments:
            lines.append(self._getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments))
        lines.append(self._getmsgpartstr("msgstr", self.msgstr))
        lines = [self._encodeifneccessary(line) for line in lines]
        postr = "".join(lines)
        return postr

    def getlocations(self):
        """Get a list of locations from sourcecomments in the PO unit

        @rtype: List
        @return: A list of the locations with '#: ' stripped
        """
        locations = []
        for sourcecomment in self.sourcecomments:
            locations += quote.rstripeol(sourcecomment)[3:].split()
        return locations

    def addlocation(self, location):
        """Add a location to sourcecomments in the PO unit

        @param location: Text location e.g. 'file.c:23' does not include #:
        @type location: String
        """
        self.sourcecomments.append("#: %s\n" % location)

    def _extract_msgidcomments(self, text=None):
        """Extract KDE style msgid comments from the unit.

        @param text: text to extract from; when empty or None the unit's
            own msgidcomments are used
        @rtype: String
        @return: Returns the first line of the text with the first '_: '
            removed.
        """
        if not text:
            text = unquotefrompo(self.msgidcomments)
        return text.split('\n')[0].replace('_: ', '', 1)

    def getcontext(self):
        """Get the message context."""
        return unquotefrompo(self.msgctxt) + self._extract_msgidcomments()

    def getid(self):
        """Returns a unique identifier for this unit."""
        context = self.getcontext()
        # Gettext does not consider the plural to determine duplicates, only
        # the msgid. For generation of .mo files, we might want to use this
        # code to generate the entry for the hash table, but for now, it is
        # commented out for conformance to gettext.
#        id = '\0'.join(self.source.strings)
        unitid = self.source
        if self.msgidcomments:
            unitid = "_: %s\n%s" % (context, unitid)
        elif context:
            unitid = "%s\04%s" % (context, unitid)
        return unitid
760 -class pofile(pocommon.pofile):
761 """this represents a .po file containing various units""" 762 UnitClass = pounit
763 - def __init__(self, inputfile=None, encoding=None, unitclass=pounit):
764 """construct a pofile, optionally reading in from inputfile. 765 encoding can be specified but otherwise will be read from the PO header""" 766 self.UnitClass = unitclass 767 pocommon.pofile.__init__(self, unitclass=unitclass) 768 self.units = [] 769 self.filename = '' 770 self._encoding = encodingToUse(encoding) 771 if inputfile is not None: 772 self.parse(inputfile)
773
774 - def changeencoding(self, newencoding):
775 """changes the encoding on the file""" 776 self._encoding = encodingToUse(newencoding) 777 if not self.units: 778 return 779 header = self.header() 780 if not header or header.isblank(): 781 return 782 charsetline = None 783 headerstr = unquotefrompo(header.msgstr, True) 784 for line in headerstr.split("\\n"): 785 if not ":" in line: continue 786 key, value = line.strip().split(":", 1) 787 if key.strip() != "Content-Type": continue 788 charsetline = line 789 if charsetline is None: 790 headerstr += "Content-Type: text/plain; charset=%s" % self._encoding 791 else: 792 charset = re.search("charset=([^ ]*)", charsetline) 793 if charset is None: 794 newcharsetline = charsetline 795 if not newcharsetline.strip().endswith(";"): 796 newcharsetline += ";" 797 newcharsetline += " charset=%s" % self._encoding 798 else: 799 charset = charset.group(1) 800 newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1) 801 headerstr = headerstr.replace(charsetline, newcharsetline, 1) 802 header.msgstr = quoteforpo(headerstr)
803
804 - def parse(self, input):
805 """parses the given file or file source string""" 806 if hasattr(input, 'name'): 807 self.filename = input.name 808 elif not getattr(self, 'filename', ''): 809 self.filename = '' 810 if hasattr(input, "read"): 811 posrc = input.read() 812 input.close() 813 input = posrc 814 # TODO: change this to a proper parser that doesn't do line-by-line madness 815 lines = input.split("\n") 816 start = 0 817 end = 0 818 # make only the first one the header 819 linesprocessed = 0 820 while end <= len(lines): 821 if (end == len(lines)) or (not lines[end].strip()): # end of lines or blank line 822 newpe = self.UnitClass(encoding=self._encoding) 823 linesprocessed = newpe.parse("\n".join(lines[start:end])) 824 start += linesprocessed 825 # TODO: find a better way of working out if we actually read anything 826 if linesprocessed >= 1 and newpe._getoutput(): 827 self.units.append(newpe) 828 if newpe.isheader(): 829 if "Content-Type" in self.parseheader(): 830 self._encoding = newpe._encoding 831 # now that we know the encoding, decode the whole file 832 if self._encoding is not None and self._encoding.lower() != 'charset': 833 lines = self.decode(lines) 834 if self._encoding is None: #still have not found an encoding, let's assume UTF-8 835 #TODO: This might be dead code 836 self._encoding = 'utf-8' 837 lines = self.decode(lines) 838 self.units = [] 839 start = 0 840 end = 0 841 end = end+1
842
843 - def removeduplicates(self, duplicatestyle="merge"):
844 """make sure each msgid is unique ; merge comments etc from duplicates into original""" 845 msgiddict = {} 846 uniqueunits = [] 847 # we sometimes need to keep track of what has been marked 848 # TODO: this is using a list as the pos aren't hashable, but this is slow... 849 markedpos = [] 850 def addcomment(thepo): 851 thepo.msgidcomments.append('"_: %s\\n"' % " ".join(thepo.getlocations())) 852 markedpos.append(thepo)
853 for thepo in self.units: 854 if duplicatestyle.startswith("msgid_comment"): 855 msgid = unquotefrompo(thepo.msgidcomments) + unquotefrompo(thepo.msgid) 856 else: 857 msgid = unquotefrompo(thepo.msgid) 858 if thepo.isheader(): 859 # header msgids shouldn't be merged... 860 uniqueunits.append(thepo) 861 elif duplicatestyle == "msgid_comment_all": 862 addcomment(thepo) 863 uniqueunits.append(thepo) 864 elif msgid in msgiddict: 865 if duplicatestyle == "merge": 866 if msgid: 867 msgiddict[msgid].merge(thepo) 868 else: 869 addcomment(thepo) 870 uniqueunits.append(thepo) 871 elif duplicatestyle == "keep": 872 uniqueunits.append(thepo) 873 elif duplicatestyle == "msgid_comment": 874 origpo = msgiddict[msgid] 875 if origpo not in markedpos: 876 addcomment(origpo) 877 addcomment(thepo) 878 uniqueunits.append(thepo) 879 elif duplicatestyle == "msgctxt": 880 origpo = msgiddict[msgid] 881 if origpo not in markedpos: 882 origpo.msgctxt.append('"%s"' % " ".join(origpo.getlocations())) 883 markedpos.append(thepo) 884 thepo.msgctxt.append('"%s"' % " ".join(thepo.getlocations())) 885 uniqueunits.append(thepo) 886 else: 887 if not msgid and duplicatestyle != "keep": 888 addcomment(thepo) 889 msgiddict[msgid] = thepo 890 uniqueunits.append(thepo) 891 self.units = uniqueunits
892
893 - def __str__(self):
894 """convert to a string. double check that unicode is handled somehow here""" 895 output = self._getoutput() 896 if isinstance(output, unicode): 897 return output.encode(getattr(self, "encoding", "UTF-8")) 898 return output
899
900 - def _getoutput(self):
901 """convert the units back to lines""" 902 lines = [] 903 for unit in self.units: 904 unitsrc = str(unit) + "\n" 905 lines.append(unitsrc) 906 lines = "".join(self.encode(lines)).rstrip() 907 #After the last pounit we will have \n\n and we only want to end in \n: 908 if lines: lines += "\n" 909 return lines
910
911 - def encode(self, lines):
912 """encode any unicode strings in lines in self._encoding""" 913 newlines = [] 914 encoding = self._encoding 915 if encoding is None or encoding.lower() == "charset": 916 encoding = 'UTF-8' 917 for line in lines: 918 if isinstance(line, unicode): 919 line = line.encode(encoding) 920 newlines.append(line) 921 return newlines
922
923 - def decode(self, lines):
924 """decode any non-unicode strings in lines with self._encoding""" 925 newlines = [] 926 for line in lines: 927 if isinstance(line, str) and self._encoding is not None and self._encoding.lower() != "charset": 928 try: 929 line = line.decode(self._encoding) 930 except UnicodeError, e: 931 raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self._encoding, e, line)) 932 newlines.append(line) 933 return newlines
934
935 - def unit_iter(self):
936 for unit in self.units: 937 if not (unit.isheader() or unit.isobsolete()): 938 yield unit
939 940 if __name__ == '__main__': 941 import sys 942 pf = pofile(sys.stdin) 943 sys.stdout.write(str(pf)) 944