s3.s3pdf

1 # -*- coding: utf-8 -*- 2 3 """ Resource PDF Tools 4 5 @see: U{B{I{S3XRC}} <http://eden.sahanafoundation.org/wiki/S3XRC>} 6 7 @requires: U{B{I{ReportLab}} <http://www.reportlab.com>} 8 9 ###################################################################### 10 DEPRECATION WARNING 11 12 This class is being replaced by the S3RL_PDF codec 13 14 Initially the reporting features will be replaced, with the OCR 15 process being removed at a later stage. 16 ###################################################################### 17 18 @copyright: 2011-2019 (c) Sahana Software Foundation 19 @license: MIT 20 21 Permission is hereby granted, free of charge, to any person 22 obtaining a copy of this software and associated documentation 23 files (the "Software"), to deal in the Software without 24 restriction, including without limitation the rights to use, 25 copy, modify, merge, publish, distribute, sublicense, and/or sell 26 copies of the Software, and to permit persons to whom the 27 Software is furnished to do so, subject to the following 28 conditions: 29 30 The above copyright notice and this permission notice shall be 31 included in all copies or substantial portions of the Software. 32 33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 34 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 35 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 36 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 37 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 38 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 39 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 40 OTHER DEALINGS IN THE SOFTWARE. 
41 """ 42 43 __all__ = ("S3PDF",) 44 45 import json 46 import math 47 import os 48 import re 49 import sys 50 import subprocess 51 import unicodedata 52 53 from copy import deepcopy 54 try: 55 from cStringIO import StringIO # Faster, where available 56 except: 57 from StringIO import StringIO 58 from datetime import datetime, timedelta, date 59 # Not using soupparser's unescape for now as it adds BeautifulSoup module 60 # to the dependency list for just one utility 61 #from lxml.html.soupparser import unescape 62 from htmlentitydefs import name2codepoint 63 64 from gluon import * 65 from gluon.storage import Storage 66 from gluon.contenttype import contenttype 67 from gluon.languages import lazyT 68 69 try: 70 from lxml import etree 71 except ImportError: 72 sys.stderr.write("ERROR: lxml module needed for XML handling\n") 73 raise 74 75 from s3datetime import S3DateTime 76 from s3rest import S3Method 77 from s3utils import s3_represent_value, s3_validate 78 import s3codec 79 80 try: 81 from PIL import Image 82 from PIL import ImageOps 83 from PIL import ImageStat 84 PILImported = True 85 except(ImportError): 86 try: 87 import Image 88 import ImageOps 89 import ImageStat 90 PILImported = True 91 except(ImportError): 92 sys.stderr.write("S3 Debug: S3PDF: Python Image Library not installed\n") 93 PILImported = False 94 try: 95 from reportlab.lib.enums import TA_CENTER, TA_RIGHT 96 from reportlab.pdfbase import pdfmetrics 97 98 from reportlab.pdfgen import canvas 99 from reportlab.lib.fonts import tt2ps 100 from reportlab.rl_config import canvas_basefontname as _baseFontName 101 from reportlab.platypus import BaseDocTemplate, SimpleDocTemplate, PageTemplate 102 from reportlab.platypus.frames import Frame 103 from reportlab.platypus import Spacer, PageBreak, Paragraph 104 from reportlab.platypus import Table, TableStyle 105 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle 106 from reportlab.lib.units import inch 107 from reportlab.lib.units import cm 
108 from reportlab.lib import colors 109 from reportlab.lib.colors import Color 110 from reportlab.lib.pagesizes import A4, LETTER, landscape, portrait 111 from reportlab.platypus.flowables import Flowable 112 reportLabImported = True 113 except ImportError: 114 sys.stderr.write("S3 Debug: S3PDF: Reportlab not installed\n") 115 reportLabImported = False 116 117 # Maximum number of options a field can have 118 MAX_FORM_OPTIONS_LIMIT = 12 119 120 # Will be loaded with values during S3PDF apply_method 121 ERROR = Storage()

# =============================================================================
def checkDependencies(r):
    """
        Report an error on the request if one of the optional libraries
        used for PDF output could not be imported

        @param r: the request object; r.error(501, message) is called
                  for every library that is flagged as not imported
                  (ReportLab is checked first, then PIL)
    """

    translate = current.T
    # Both messages are built before any check is made
    messages = Storage(
        PIL_ERROR=translate("PIL (Python Image Library) not installed"),
        REPORTLAB_ERROR=translate("ReportLab not installed"),
    )

    # (import flag, message to report when the flag is not set)
    checks = ((reportLabImported, messages.REPORTLAB_ERROR),
              (PILImported, messages.PIL_ERROR),
              )
    for imported, message in checks:
        if not imported:
            r.error(501, message)

# (trailing notes from checkDependencies)
# redirect() is not available in this scope
#current.session.error = self.ERROR.REPORTLAB_ERROR
#redirect(URL(extension=""))


# =============================================================================
if reportLabImported:

    # =========================================================================
    class ChangePageTitle(Flowable):
        """
            Flowable which, when drawn, sets the title attribute of the
            given document to a new value
        """

        def __init__(self, doc, newTitle):
            """
                @param doc: the document whose title shall be changed
                @param newTitle: the title to assign when drawn
            """

            Flowable.__init__(self)
            self.title = newTitle
            self.doc = doc

        def draw(self):
            """ Assign the stored title to the document """

            document = self.doc
            document.title = self.title

    # =========================================================================
    class Overlay(Flowable):
        """
            Flowable which, when drawn, hands its canvas and a data object
            to a callback
        """

        def __init__(self, callback, data):
            """
                @param callback: function called as callback(canvas, data)
                @param data: the object passed through to the callback
            """

            Flowable.__init__(self)
            self.data = data
            self.function = callback

        def draw(self):
            """ Invoke the callback with this flowable's canvas """

            render = self.function
            render(self.canv, self.data)

    # =========================================================================
    class EdenDocTemplate(BaseDocTemplate):
        """
            The standard document template for eden reports
            It provides the following page templates:
            1) First Page ("first")
            2) Even Page ("even")
            3) Odd Page ("odd")
            4) Landscape Page ("landscape")
        """

        # ---------------------------------------------------------------------
        def setPageTemplates(self,
                             first,
                             firstEnd,
                             even = None,
                             odd = None,
                             landscape = None,
                             ):
            """
                Store the page callbacks used by the page templates
                which are created in build()

                @param first: onPage callback for the first page, also
                              the fallback for even, odd and landscape
                @param firstEnd: onPageEnd callback (build() uses it for
                                 the first, even and odd templates)
                @param even: onPage callback for even pages
                @param odd: onPage callback for odd pages
                @param landscape: onPage callback for landscape pages
            """

            self.onfirst = first
            self.onfirstEnd = firstEnd
            # Any callback which has not been given falls back to "first"
            self.oneven = even or first
            self.onodd = odd or first
            self.onlandscape = landscape or first
            self.needLandscape = False

        # ---------------------------------------------------------------------
        def handle_pageBegin(self):
            """
                Run the standard page-begin handling, then request the
                template for the next page: "landscape" if needLandscape
                is set, otherwise "odd" or "even" depending on self.page
            """

            self._handle_pageBegin()
            if self.needLandscape:
                template = "landscape"
            else:
                template = "odd" if self.page % 2 == 1 else "even"
            self._handle_nextPageTemplate(template)

        # ---------------------------------------------------------------------
        def build(self, flowables, canvasmaker=canvas.Canvas):
            """
                Build the document using the flowables.

                Registers the four page templates (all sharing a single
                body frame) before delegating to BaseDocTemplate.build

                @param flowables: the flowables to build the document from
                @param canvasmaker: passed on to BaseDocTemplate.build
            """

            self._calc() # in case we changed margins sizes etc
            showBoundary = 0 # for debugging set to 1
            body = Frame(self.leftMargin,
                         self.bottomMargin,
                         self.width,
                         self.height,
                         id="body",
                         showBoundary = showBoundary)

            pagesize = self.pagesize
            onPageEnd = self.onfirstEnd

            # These three templates share the same onPageEnd callback
            standard = (("first", self.onfirst),
                        ("even", self.oneven),
                        ("odd", self.onodd),
                        )
            templates = [PageTemplate(id=name,
                                      frames=body,
                                      onPage=onPage,
                                      onPageEnd=onPageEnd,
                                      pagesize=pagesize)
                         for name, onPage in standard]

            # The landscape template is created without an onPageEnd callback
            templates.append(PageTemplate(id="landscape",
                                          frames=body,
                                          onPage=self.onlandscape,
                                          pagesize=pagesize))

            self.addPageTemplates(templates)
            BaseDocTemplate.build(self, flowables, canvasmaker=canvasmaker)


255 # ============================================================================= 256 -class S3PDF(S3Method):

257 """ 258 Class to help generate PDF documents. 259 260 A typical implementation would be as follows: 261 262 exporter = s3base.S3PDF() 263 return exporter(xrequest, **attr) 264 265 Currently this class supports two types of reports: 266 A List: Typically called from the icon shown in a search 267 For example inv/warehouse 268 A Header plus List: Typically called from a button on a form 269 For example ??? 270 271 Add additional generic forms to the apply_method() function 272 For specialist forms a S3PDF() object will need to be created. 273 See the apply_method() for ideas on how to create a form, 274 but as a minimum the following structure is required: 275 276 pdf = S3PDF() 277 pdf.newDocument(pdf.defaultTitle(resource)) 278 279 # Add specific pages here 280 281 return pdf.buildDoc() 282 """ 283 284 # -------------------------------------------------------------------------

285 - def apply_method(self, r, **attr):

286 """ 287 Apply CRUD methods 288 289 @param r: the S3Request 290 @param attr: dictionary of parameters for the method handler 291 The attributes that it knows about are: 292 * componentname 293 * formname 294 * list_fields 295 * report_groupby 296 * report_hide_comments 297 298 @return: output object to send to the view 299 """ 300 301 # --------------------------------------------------------------------- 302 def getParam(key): 303 """ 304 nested function to get the parameters passed into apply_method 305 306 @todo find out if this has been done better elsewhere! :( 307 308 This will first try and get the argument from the attr parameter, 309 if it's not here then try self._config() 310 """ 311 value = attr.get(key) 312 if value != None: 313 return value 314 return self._config(key)

315 316 T = current.T 317 self.ERROR = ERROR = Storage( 318 NO_RECORDS=T("No records in this resource. Add one more records manually and then retry."), 319 TESSERACT_ERROR=T("%(app)s not installed. Ask the Server Administrator to install on Server.") % dict(app="Tesseract 3.01"), 320 EMPTY_OCR_FORM=T("Selected OCR Form has no pages. Use another revision of create a new revision by downloading a new Form."), 321 INVALID_IMAGE_TYPE=T("Uploaded file(s) are not Image(s). Supported image formats are '.png', '.jpg', '.bmp', '.gif'."), 322 OCR_DISABLED=T("OCR module is disabled. Ask the Server Administrator to enable it."), 323 IMAGE_MAGICK_ERROR=T("%(app)s not installed. Ask the Server Administrator to install on Server.") % dict(app="ImageMagick"), 324 NOT_PDF_FILE=T("Uploaded file is not a PDF file. Provide a Form in valid PDF Format."), 325 INVALID_PDF=T("Uploaded PDF file has more/less number of page(s) than required. Check if you have provided appropriate revision for your Form as well as check the Form contains appropriate number of pages."), 326 NO_UTC_OFFSET=T("No UTC offset found. Please set UTC offset in your 'User Profile' details. Example: UTC+0530"), 327 INVALID_JOBID=T("The provided 'jobuuid' is invalid. The session of Form upload is invalid. You should retry uploading."), 328 INVALID_FORMID=T("The provided 'formuuid' is invalid. 
You have selected a Form revision which does not exist on this server."), 329 UNRECOVERABLE_ERROR=T("The uploaded Form is unreadable, please do manual data entry."), 330 JOB_COMPLETE=T("This job has already been finished successfully."), 331 ) 332 333 self.r = r 334 checkDependencies(r) 335 settings = current.deployment_settings 336 request = current.request 337 response = current.response 338 session = current.session 339 db = current.db 340 341 if DEBUG: 342 content_disposition = "inline" 343 else: 344 content_disposition = "attachment" 345 346 if settings.get_pdf_size() == "Letter": 347 self.paper_size = LETTER 348 else: 349 self.paper_size = A4 350 351 try: 352 self.logo = os.path.join(request.folder, 353 settings.get_pdf_logo()) 354 except: 355 self.logo = None 356 self.headerBanner = None 357 358 method = self.method 359 360 callback = getParam("callback") 361 if callback != None: 362 title = getParam("formname") 363 if title == None: 364 title = self.defaultTitle(self.resource) 365 header = getParam("header") 366 if header == None: 367 header = self.pageHeader 368 footer = getParam("footer") 369 if footer == None: 370 footer = self.pageFooter 371 filename = getParam("filename") 372 if filename == None: 373 filename = title 374 self.newDocument(title, 375 header=header, 376 footer=footer, 377 filename = filename) 378 try: 379 id = r.component_id 380 if id == None: 381 id = r.id 382 except: 383 try: 384 id = r.id 385 except: 386 id = None 387 388 callback(self, id=id) 389 # Build the document 390 doc = self.buildDoc() 391 # Set content type and disposition headers 392 if response: 393 response.headers["Content-Type"] = contenttype(".pdf") 394 response.headers["Content-disposition"] = \ 395 "%s; filename=\"%s\"" % (content_disposition, 396 self.filename) 397 398 # Return the stream 399 return doc 400 401 elif r.http == "GET": 402 if self.method in ("read", "list"): 403 # Normal PDF output 404 # Get the configuration parameters 405 componentname = 
getParam("componentname") 406 title = getParam("formname") 407 list_fields = getParam("list_fields") 408 report_groupby = getParam("report_groupby") 409 report_hide_comments = getParam("report_hide_comments") 410 filename = getParam("filename") 411 if filename == None: 412 filename = title 413 414 # Create the document shell 415 if title == None: 416 title = self.defaultTitle(self.resource) 417 self.newDocument(title, 418 header=self.pageHeader, 419 footer=self.pageFooter, 420 filename = filename) 421 422 if "report_landscape" in attr: 423 self.setLandscape() 424 # get the header details, if appropriate 425 if "rheader" in attr and attr["rheader"]: 426 self.extractrHeader(attr["rheader"]) 427 self.addSpacer(3) 428 elif componentname: 429 self.addrHeader(self.resource, 430 list_fields, 431 report_hide_comments=report_hide_comments) 432 self.addSpacer(3) 433 # Add details to the document 434 if componentname == None: 435 # Document that only has a resource list 436 self.addTable(self.resource, 437 list_fields=list_fields, 438 report_groupby=report_groupby, 439 report_hide_comments=report_hide_comments) 440 else: 441 # Document that has a resource header and component list 442 # Get the raw data for the component 443 ptable = self.resource.table 444 ctable = db[componentname] 445 raw_data = [] 446 linkfield = None 447 for link in ptable._referenced_by: 448 if link[0] == componentname: 449 linkfield = link[1] 450 break 451 if linkfield != None: 452 query = ctable[linkfield] == self.record_id 453 records = db(query).select() 454 find_fields = [] 455 for component in self.resource.components.values(): 456 find_fields += component.readable_fields() 457 fields = [] 458 if list_fields: 459 for lf in list_fields: 460 for field in find_fields: 461 if field.name == lf: 462 fields.append(field) 463 break 464 else: 465 for field in find_fields: 466 if field.type == "id": 467 continue 468 if report_hide_comments and field.name == "comments": 469 continue 470 fields.append(field) 
471 if not fields: 472 fields = [table.id] 473 label_fields = [f.label for f in fields] 474 475 for record in records: 476 data = [] 477 for field in fields: 478 value = record[field.name] 479 text = s3_represent_value(field, 480 value=value, 481 strip_markup=True, 482 non_xml_output=True, 483 extended_comments=True 484 ) 485 data.append(text) 486 raw_data.append(data) 487 self.addTable(raw_data = raw_data, 488 list_fields=label_fields) 489 490 if "report_footer" in attr: 491 self.addSpacer(3) 492 self.extractrHeader(attr["report_footer"]) 493 # Build the document 494 doc = self.buildDoc() 495 496 # Set content type and disposition headers 497 if response: 498 response.headers["Content-Type"] = contenttype(".pdf") 499 response.headers["Content-disposition"] = \ 500 "%s; filename=\"%s\"" % (content_disposition, 501 self.filename) 502 503 # Return the stream 504 return doc 505 506 elif method == "create": 507 if current.deployment_settings.has_module("ocr"): 508 # Create an OCR PDF form 509 import uuid 510 formUUID = uuid.uuid1() 511 self.newOCRForm(formUUID) 512 513 # Put values 514 self.OCRPDFManager() 515 516 # Build the document 517 doc = self.buildDoc() 518 numPages = self.doc.numPages 519 layoutXML = self.__getOCRLayout() 520 self.__update_dbmeta(formUUID, layoutXML, numPages) 521 522 # Set content type and disposition headers 523 if response: 524 response.headers["Content-Type"] = contenttype(".pdf") 525 response.headers["Content-disposition"] = \ 526 "%s; filename=\"%s\"" % (content_disposition, 527 self.filename) 528 529 # Return the stream 530 return doc 531 532 else: 533 # @ToDo: Produce a simple form 534 r.error(501, self.ERROR.OCR_DISABLED) 535 536 elif method == "import": 537 # Render a review UI 538 if not current.deployment_settings.has_module("ocr"): 539 r.error(501, self.ERROR.OCR_DISABLED) 540 541 authorised = self._permitted(method="create") 542 if not authorised: 543 r.unauthorised() 544 545 try: 546 if r.component: 547 trigger = r.args[3] 548 
else: 549 trigger = r.args[1] 550 except(IndexError): 551 trigger = None 552 553 if trigger == "review": 554 try: 555 jobuuid = r.vars["jobuuid"] 556 except(KeyError): 557 r.error(501, current.ERROR.BAD_REQUEST) 558 559 # Check if operation is valid on the given job_uuid 560 current.s3db.table("ocr_meta") 561 statustable = db.ocr_form_status 562 query = (statustable.job_uuid == jobuuid) 563 row = db(query).select(statustable.review_status, 564 statustable.job_has_errors, 565 statustable.image_set_uuid, 566 statustable.form_uuid, 567 limitby=(0, 1)).first() 568 if not row: 569 # No such job 570 r.error(501, self.ERROR.INVALID_JOBID) 571 572 if row.review_status == 1: 573 # Job has already been reviewed 574 r.error(501, self.ERROR.JOB_COMPLETE) 575 576 # Retrieve meta data 577 if row.job_has_errors == 1: 578 job_has_errors = True 579 else: 580 job_has_errors = False 581 582 self.setuuid = row.image_set_uuid 583 584 # Retrieve s3ocrxml 585 formuuid = row.form_uuid 586 metatable = db.ocr_meta 587 row = db(metatable.form_uuid == formuuid).select(metatable.s3ocrxml_file, 588 limitby=(0, 1)).first() 589 if not row: 590 r.error(501, self.ERROR.INVALID_FORMID) 591 592 s3ocrxml_filename = row.s3ocrxml_file 593 f = open(os.path.join(r.folder, 594 "uploads", 595 "ocr_meta", 596 s3ocrxml_filename), 597 "rb") 598 s3ocrxml = f.read() 599 f.close() 600 601 s3ocrdict = self.__s3ocrxml2dict(s3ocrxml) 602 603 # Retrieve the job 604 import_job = self.resource.import_tree(None, None, 605 job_id=jobuuid, 606 commit_job=False, 607 ignore_errors=True) 608 609 s3import_enabled = True 610 if s3import_enabled: 611 s3ocrdata = self.__importjob2data(import_job) 612 613 else: 614 # Retrive s3ocr data xml 615 table = db.ocr_data_xml 616 query = (table.image_set_uuid == self.setuuid) 617 row = db(query).select(table.data_file, 618 limitby=(0, 1)).first() 619 620 if not row: 621 r.error(501, current.ERROR.BAD_RECORD) 622 623 s3ocrdataxml_filename = row.data_file 624 f = open(os.path.join(r.folder, 
625 "uploads", 626 "ocr_payload", 627 s3ocrdataxml_filename), 628 "rb") 629 s3ocrdataxml = f.read() 630 f.close() 631 632 s3ocrdata = self.__temp_ocrdataxml_parser(s3ocrdataxml) 633 634 reviewform = self.__create_review_form(s3ocrdict, s3ocrdata) 635 636 return response.render("_ocr_review.html", 637 dict(reviewform=reviewform) 638 ) 639 640 elif trigger == "image": 641 # Do import job 642 try: 643 setuuid = r.vars["setuuid"] 644 resource_table = r.vars["resource_table"] 645 field_name = r.vars["field_name"] 646 except(KeyError): 647 r.error(501, current.ERROR.BAD_REQUEST) 648 649 try: 650 value = r.vars["value"] 651 except(KeyError): 652 value = None 653 try: 654 sequence = r.vars["sequence"] 655 except(KeyError): 656 r.error(501, current.ERROR.BAD_REQUEST) 657 658 # Load ocr tables 659 current.s3db.table("ocr_meta") 660 table = db.ocr_field_crops 661 if value: 662 query = (table.image_set_uuid == setuuid) & \ 663 (table.resource_table == resource_table) & \ 664 (table.field_name == field_name) & \ 665 (table.value == value) 666 row = db(query).select(table.image_file, 667 limitby=(0, 1)).first() 668 else: 669 query = (table.image_set_uuid == setuuid) & \ 670 (table.resource_table == resource_table) & \ 671 (table.field_name == field_name) & \ 672 (table.sequence == sequence) 673 row = db(query).select(table.image_file, 674 limitby=(0, 1)).first() 675 if not row: 676 r.error(501, current.ERROR.BAD_RECORD) 677 678 format = row.image_file[-4:] 679 image_file = open(os.path.join(r.folder, 680 "uploads", 681 "ocr_payload", 682 row.image_file)) 683 image_file_content = image_file.read() 684 image_file.close() 685 # Set content type and disposition headers 686 if response: 687 response.headers["Content-Type"] = contenttype(format) 688 response.headers["Content-disposition"] = \ 689 "%s; filename=\"%s\"" % ("inline", 690 "tempimage%s" % format) 691 692 # Return the stream 693 return image_file_content 694 695 elif trigger == "import": 696 # Do import job 697 try: 698 
setuuid = r.vars["setuuid"] 699 except(KeyError): 700 r.error(501, current.ERROR.BAD_REQUEST) 701 702 # Check if operation is valid on the given set_uuid 703 statustable = s3db.ocr_form_status 704 query = (statustable.image_set_uuid == setuuid) 705 row = db(query).select(statustable.job_uuid, 706 limitby=(0, 1)).first() 707 if row: 708 # This set of images has already been imported 709 jobuuid = row.job_uuid 710 711 if r.component: 712 # If component 713 request_args = request.get("args", ["", ""]) 714 record_id = request_args[0] 715 component_name = request_args[1] 716 urlprefix = "%s/%s/%s" % (request.function, 717 record_id, 718 component_name) 719 else: 720 # Not a component 721 urlprefix = request.function 722 723 redirect(URL(request.controller, 724 "%s/upload.pdf" % urlprefix, 725 args="review", 726 vars={"jobuuid":jobuuid})) 727 728 table = db.ocr_data_xml 729 row = db(table.image_set_uuid == setuuid).select(table.data_file, 730 table.form_uuid, 731 limitby=(0, 1) 732 ).first() 733 if not row: 734 r.error(501, current.ERROR.BAD_RECORD) 735 736 data_file = open(os.path.join(r.folder, 737 "uploads", 738 "ocr_payload", 739 row.data_file)) 740 formuuid = row.form_uuid 741 742 datafile_content = data_file.read() 743 data_file.close() 744 745 metatable = db.ocr_meta 746 row = db(metatable.form_uuid == formuuid).select(metatable.s3ocrxml_file, 747 limitby=(0, 1) 748 ).first() 749 if not row: 750 r.error(501, self.ERROR.INVALID_FORMID) 751 752 s3ocrxml_filename = row.s3ocrxml_file 753 f = open(os.path.join(r.folder, 754 "uploads", 755 "ocr_meta", 756 s3ocrxml_filename), 757 "rb") 758 s3ocrxml = f.read() 759 f.close() 760 761 s3ocrdict = self.__s3ocrxml2dict(s3ocrxml) 762 crosslimit_options = {} 763 for resourcename in s3ocrdict["$resource_seq"]: 764 resource = s3ocrdict[resourcename] 765 for fieldname in resource["$field_seq"]: 766 field = resource[fieldname] 767 if field.has_options: 768 if field.options and \ 769 field.options.count > MAX_FORM_OPTIONS_LIMIT: 770 
if not crosslimit_options.has_key(resourcename): 771 crosslimit_options[resourcename] = [fieldname] 772 else: 773 crosslimit_options[resourcename].append(fieldname) 774 775 if len(crosslimit_options) != 0: 776 s3xml_root = etree.fromstring(datafile_content) 777 resource_element = s3xml_root.getchildren()[0] 778 resourcename = resource_element.attrib.get("name") 779 for field in resource_element: 780 if field.tag == "data": 781 if crosslimit_options.has_key(resourcename): 782 fieldname = field.attrib.get("field") 783 if fieldname in crosslimit_options[resourcename]: 784 match_status = {} 785 value = field.text.encode("utf-8").lower() 786 for option in s3ocrdict[resourcename][fieldname].options.list: 787 try: 788 fieldtext = option.label.lower() 789 except: 790 fieldtext = "" 791 match_status[option.value] =\ 792 self.dameraulevenshtein(cast2ascii(fieldtext), 793 cast2ascii(value)) 794 795 closematch_value = 1000000000 796 closematch = [] 797 798 for match in match_status.keys(): 799 if match_status[match] < closematch_value: 800 closematch = [match] 801 closematch_value = match_status[match] 802 elif match_status[match] == closematch_value: 803 closematch.append(match) 804 805 if len(closematch) > 0: 806 value = closematch[0] 807 else: 808 value = "" 809 810 field.text = value 811 field.attrib["value"] = value 812 813 814 elif field.tag == "resource": 815 resourcename = field.attrib.get("name") 816 for subfield in field: 817 if subfield.tag == "data": 818 fieldname = subfield.attrib.get("field") 819 if resourcename in crosslimit_options.keys() and\ 820 fieldname in crosslimit_options[resourcename]: 821 match_status = {} 822 value = subfield.text.encode("utf-8").lower() 823 for option in s3ocrdict[resourcename][fieldname].options.list: 824 try: 825 fieldtext = option.label.lower() 826 except: 827 fieldtext = "" 828 match_status[option.value] =\ 829 self.dameraulevenshtein(cast2ascii(fieldtext), 830 cast2ascii(value)) 831 832 closematch_value = 1000000000 833 
closematch = [] 834 835 for match in match_status.keys(): 836 if match_status[match] < closematch_value: 837 closematch = [match] 838 closematch_value = match_status[match] 839 elif match_status[match] == closematch_value: 840 closematch.append(match) 841 842 if len(closematch) > 0: 843 value = closematch[0] 844 else: 845 value = "" 846 847 subfield.text = value 848 subfield.attrib["value"] = value 849 850 datafile_content = etree.tostring(s3xml_root) 851 852 # import_xml routine 853 outputjson = self.resource.import_xml(StringIO(datafile_content), 854 commit_job=False, 855 ignore_errors=True) 856 857 # Get metadata for review 858 jobuuid = self.resource.job.job_id 859 json2dict = json.loads(outputjson, strict=False) 860 861 if json2dict.has_key("message"): 862 jobhaserrors = 1 863 else: 864 jobhaserrors = 0 865 866 # Check status code 867 if json2dict.get("statuscode") != "200": 868 r.error(501, self.ERROR.UNRECOVERABLE_ERROR) 869 870 # Store metadata for review 871 db.ocr_form_status.insert(image_set_uuid=setuuid, 872 form_uuid=formuuid, 873 job_uuid=jobuuid, 874 job_has_errors=jobhaserrors) 875 876 if r.component: 877 request_args = request.get("args", ["", ""]) 878 record_id = request_args[0] 879 component_name = request_args[1] 880 urlprefix = "%s/%s/%s" % (request.function, 881 record_id, 882 component_name) 883 884 else: 885 # Not a component 886 urlprefix = request.function 887 888 redirect(URL(request.controller, 889 "%s/upload.pdf" % urlprefix, 890 args="review", 891 vars={"jobuuid":jobuuid})) 892 893 else: 894 # Render upload UI 895 896 # Check if user has UTC offset in his profile 897 auth = current.auth 898 if auth.user: 899 utc_offset = auth.user.utc_offset 900 else: 901 r.error(501, self.ERROR.NO_UTC_OFFSET) 902 903 # Load OCR tables 904 current.s3db.ocr_meta 905 906 # Create an html image upload form for user 907 formuuid = r.vars.get("formuuid", None) 908 uploadformat = r.vars.get("uploadformat", None) 909 requesturl = request.env.path_info 910 
createurl = "%s/create.pdf" %\ 911 requesturl[0:requesturl.rfind("/")] 912 if not (formuuid and uploadformat): 913 availForms = self.__getResourceForms() 914 return response.render("_ocr_upload.html", 915 dict(availForms=availForms, 916 createurl=createurl)) 917 else: 918 try: 919 numpages = self.__getNumPages(formuuid) 920 except: 921 r.error(501, current.ERROR.BAD_RECORD) 922 923 if not numpages: 924 r.error(501, self.ERROR.EMPTY_OCR_FORM) 925 926 return response.render("_ocr_page_upload.html", 927 dict(numpages=numpages, 928 posturl=createurl, 929 formuuid=formuuid, 930 uploadformat=uploadformat)) 931 932 numpages = self.__getNumPages(formuuid) 933 if not numpages: 934 r.error(501, self.ERROR.EMPTY_OCR_FORM) 935 936 return response.render("_ocr_page_upload.html", 937 dict(numpages=numpages, 938 posturl=createurl, 939 formuuid=formuuid, 940 uploadformat=uploadformat)) 941 942 else: 943 r.error(405, current.ERROR.BAD_METHOD) 944 945 elif r.http == "POST": 946 if method == "create": 947 # Upload scanned OCR images 948 if not current.deployment_settings.has_module("ocr"): 949 r.error(501, self.ERROR.OCR_DISABLED) 950 951 # Form meta vars 952 formuuid = r.vars.formuuid 953 numpages = int(r.vars.numpages) 954 uploadformat = r.vars.uploadformat 955 956 # Set id for given form 957 import uuid 958 setuuid = uuid.uuid1() 959 960 # Load model 961 current.s3db.ocr_meta 962 963 # Check for upload format 964 if uploadformat == "image": 965 # store each page into db/disk 966 payloadtable = db.ocr_payload 967 for eachpage in xrange(1, numpages + 1): 968 varname = "page%s" % eachpage 969 fileholder = r.vars[varname] 970 pagenumber = eachpage 971 972 # server side file validation 973 imgfilename = fileholder.filename 974 extension = lambda m: m[m.rfind(".") + 1:] 975 imageformats = ["jpg", "png", "gif", "bmp"] 976 977 if extension(imgfilename) not in imageformats: 978 r.error(501, self.ERROR.INVALID_IMAGE_TYPE) 979 980 # store page 981 payloadtable.insert( 982 
image_set_uuid=setuuid, 983 image_file=payloadtable["image_file"].store(\ 984 fileholder.file, 985 fileholder.filename), 986 page_number=pagenumber) 987 988 elif uploadformat == "pdf": 989 fileholder = r.vars["pdffile"] 990 # server side file validation 991 filename = fileholder.filename 992 extension = lambda m: m[m.rfind(".") + 1:] 993 994 if extension(filename) != "pdf": 995 r.error(501, self.ERROR.NOT_PDF_FILE) 996 997 # create temp dir to extract the images 998 uniqueuuid = setuuid # to make it thread safe 999 inputfilename = "%s_%s" % (uniqueuuid, fileholder.filename) 1000 outputfilename = "%s_%s.png" % (uniqueuuid, 1001 fileholder.filename[:-4]) 1002 1003 ocr_temp_dir = os.path.join(self.r.folder, 1004 "uploads", "ocr_temp") 1005 try: 1006 os.mkdir(ocr_temp_dir) 1007 except(OSError): 1008 pass 1009 1010 f = open(os.path.join(ocr_temp_dir, inputfilename), "w") 1011 f.write(fileholder.file.read()) 1012 f.close() 1013 1014 success = subprocess.call(["convert", 1015 os.path.join(ocr_temp_dir, 1016 inputfilename), 1017 os.path.join(ocr_temp_dir, 1018 outputfilename)]) 1019 if success != 0: 1020 self.r.error(501, self.ERROR.IMAGE_MAGICK_ERROR) 1021 1022 # Store each page into db/disk 1023 payloadtable = db.ocr_payload 1024 1025 if numpages == 1: 1026 imagefilename = outputfilename 1027 imgfilepath = os.path.join(ocr_temp_dir, imagefilename) 1028 try: 1029 imgfile = open(imgfilepath) 1030 except(IOError): 1031 self.r.error(501, self.ERROR.INVALID_PDF) 1032 pagenumber = 1 1033 1034 # Store page 1035 payloadtable.insert( 1036 image_set_uuid=setuuid, 1037 image_file=payloadtable["image_file"].store(\ 1038 imgfile, 1039 imagefilename), 1040 page_number=pagenumber) 1041 imgfile.close() 1042 os.remove(imgfilepath) 1043 1044 else: 1045 for eachpage in xrange(0, numpages): 1046 imagefilename = "%s-%s.png" % (outputfilename[:-4], 1047 eachpage) 1048 imgfilepath = os.path.join(ocr_temp_dir, 1049 imagefilename) 1050 try: 1051 imgfile = open(imgfilepath, "r") 1052 
except(IOError): 1053 self.r.error(501, self.ERROR.INVALID_PDF) 1054 1055 pagenumber = eachpage + 1 1056 1057 # Store page 1058 payloadtable.insert( 1059 image_set_uuid=setuuid, 1060 image_file=payloadtable["image_file"].store(\ 1061 imgfile, 1062 imagefilename), 1063 page_number=pagenumber) 1064 imgfile.close() 1065 os.remove(imgfilepath) 1066 1067 os.remove(os.path.join(ocr_temp_dir, inputfilename)) 1068 try: 1069 os.rmdir(ocr_temp_dir) 1070 except(OSError): 1071 import shutil 1072 shutil.rmtree(ocr_temp_dir) 1073 1074 else: 1075 r.error(501, self.ERROR.INVALID_IMAGE_TYPE) 1076 1077 # OCR it 1078 s3ocrimageparser = S3OCRImageParser(self, r) 1079 output = s3ocrimageparser.parse(formuuid, setuuid) 1080 1081 table = db.ocr_data_xml 1082 table.insert(image_set_uuid=setuuid, 1083 data_file=table["data_file"].store( 1084 StringIO(output), 1085 "%s-data.xml" % setuuid), 1086 form_uuid=formuuid, 1087 ) 1088 1089 if r.component: 1090 request_args = current.request.get("args", ["", ""]) 1091 record_id = request_args[0] 1092 component_name = request_args[1] 1093 urlprefix = "%s/%s/%s" % (request.function, 1094 record_id, 1095 component_name) 1096 1097 else: 1098 # Not a component 1099 urlprefix = request.function 1100 1101 redirect(URL(request.controller, 1102 "%s/import.pdf" % urlprefix, 1103 args="import", 1104 vars={"setuuid":setuuid})) 1105 1106 elif method == "import": 1107 if not current.deployment_settings.has_module("ocr"): 1108 r.error(501, self.ERROR.OCR_DISABLED) 1109 1110 authorised = self._permitted(method="create") 1111 if not authorised: 1112 r.unauthorised() 1113 1114 try: 1115 if r.component: 1116 trigger = r.args[3] 1117 else: 1118 trigger = r.args[1] 1119 except(IndexError): 1120 trigger = None 1121 1122 if trigger == "review": 1123 # Review UI post 1124 jobuuid = r.vars.pop("jobuuid") 1125 1126 # Check if operation is valid on the given job_uuid 1127 statustable = current.s3db.ocr_form_status 1128 query = (statustable.job_uuid == jobuuid) 1129 row = 
db(query).select(statustable.review_status, 1130 limitby=(0, 1)).first() 1131 if not row: 1132 r.error(501, self.ERROR.INVALID_JOBID) 1133 1134 if row.review_status == 1: 1135 # Job has already been reviewed 1136 r.error(501, self.ERROR.JOB_COMPLETE) 1137 1138 try: 1139 r.vars.pop("_utc_offset") 1140 except: 1141 pass 1142 1143 try: 1144 ignore_fields = r.vars.pop("ignore-fields-list") 1145 except: 1146 ignore_fields = None 1147 1148 if not ignore_fields: 1149 ignore_fields = [] 1150 else: 1151 try: 1152 ignore_fields = ignore_fields.split("|") 1153 except: 1154 ignore_fields = [ignore_fields] 1155 1156 datadict = Storage() 1157 for field in r.vars.keys(): 1158 resourcetable, fieldname = field.split("-") 1159 if not datadict.has_key(resourcetable): 1160 datadict[resourcetable] = Storage() 1161 1162 datadict[resourcetable][fieldname] = r.vars[field] 1163 1164 for field in ignore_fields: 1165 resourcetable, fieldname = field.split("-") 1166 datadict[resourcetable].pop(fieldname) 1167 if len(datadict[resourcetable]) == 0: 1168 datadict.pop(resourcetable) 1169 1170 s3xml_etree_dict = Storage() 1171 for resource in datadict.keys(): 1172 s3xml_root = etree.Element("s3xml") 1173 resource_element = etree.SubElement(s3xml_root, "resource") 1174 resource_element.attrib["name"] = resource 1175 1176 for field in datadict[resource].keys(): 1177 fieldvalue = datadict[resource][field] 1178 fieldvalue = str(fieldvalue) if fieldvalue else "" 1179 fieldtype = db[resource][field].type 1180 if fieldtype.startswith("reference "): 1181 reference_resource_name = fieldtype[len("reference "):] 1182 # reference element 1183 reference_element =\ 1184 etree.SubElement(resource_element, "reference") 1185 reference_element.attrib["field"] = field 1186 reference_element.attrib["resource"] = reference_resource_name 1187 # resource element 1188 ref_res_element =\ 1189 etree.SubElement(reference_element, "resource") 1190 ref_res_element.attrib["name"] = reference_resource_name 1191 # data element 
1192 ref_res_data_element =\ 1193 etree.SubElement(ref_res_element, "data") 1194 ref_res_data_element.attrib["field"] = "name" 1195 try: 1196 ref_res_data_element.text = cast2ascii(fieldvalue) 1197 except(ValueError): 1198 ref_res_data_element.text = "" 1199 else: 1200 field_element = etree.SubElement(resource_element, "data") 1201 field_element.attrib["field"] = field 1202 try: 1203 field_element.attrib["value"] = cast2ascii(fieldvalue) 1204 except(ValueError): 1205 field_element.attrib["value"] = "" 1206 try: 1207 field_element.text = cast2ascii(fieldvalue) 1208 except(ValueError): 1209 field_element.text = "" 1210 1211 s3xml_etree_dict[resource] = s3xml_root 1212 1213 errordict = {} 1214 1215 _record = current.xml.record 1216 s3record_dict = Storage() 1217 for tablename in s3xml_etree_dict.keys(): 1218 record = _record(db[tablename], 1219 s3xml_etree_dict[tablename].getchildren()[0]) 1220 s3record_dict[tablename] = record 1221 1222 import_job = r.resource.import_tree(None, None, job_id=jobuuid, 1223 ignore_errors=False, 1224 commit_job=False) 1225 1226 response.headers["Content-Type"] = contenttype(".json") 1227 1228 for tablename in s3record_dict.keys(): 1229 record = s3record_dict[tablename] 1230 possible_items = [] 1231 our_item = None 1232 for eachitem in import_job.items.keys(): 1233 item = import_job.items[eachitem] 1234 if item.table == tablename: 1235 if item.data and (len(item.data) > 0): 1236 our_item = item 1237 else: 1238 if item.data and (len(item.data) == 0): 1239 possible_items.append(item) 1240 1241 if our_item: 1242 our_item.update(record) 1243 elif len(possible_items) > 0: 1244 possible_items[0].update(record) 1245 else: 1246 import_job.add_item(s3xml_etree_dict[tablename].getchildren()[0]) 1247 1248 for resourcename in datadict.keys(): 1249 table = db[resourcename] 1250 for field in datadict[resourcename].keys(): 1251 if not table[field].type.startswith("reference "): 1252 value, error = s3_validate(table, 1253 field, 1254 
datadict[resourcename][field]) 1255 if error: 1256 errordict["%s-%s" % (resourcename, field)] = str(error) 1257 1258 if not import_job.error_tree: 1259 store_success = import_job.store() 1260 if store_success: 1261 if import_job.error_tree: 1262 errordict = self.__parse_job_error_tree(import_job.error_tree) 1263 success = False 1264 else: 1265 # Revalidate data 1266 for resourcename in datadict.keys(): 1267 table = db[resourcename] 1268 for field in datadict[resourcename].keys(): 1269 if not table[field].type.startswith("reference "): 1270 value, error =\ 1271 s3_validate(table, 1272 field, 1273 datadict[resourcename][field]) 1274 if error: 1275 errordict["%s-%s" % (resourcename, field)] = str(error) 1276 1277 if len(errordict) > 0: 1278 success = False 1279 else: 1280 success = True 1281 import_job.commit() 1282 1283 else: 1284 errordict = self.__parse_job_error_tree(import_job.error_tree) 1285 success = False 1286 else: 1287 errordict = self.__parse_job_error_tree(import_job.error_tree) 1288 success = False 1289 1290 if success: 1291 session.confirmation =\ 1292 T("OCR review data has been stored into the database successfully.") 1293 1294 # Perform cleanup 1295 statustable = db["ocr_form_status"] 1296 query = (statustable.job_uuid == jobuuid) 1297 row = db(query).select(statustable.image_set_uuid).first() 1298 image_set_uuid = row.image_set_uuid 1299 1300 # Set review status = true 1301 db(query).update(review_status=1) 1302 1303 # Remove cropped images from the database 1304 cropstable = db.ocr_field_crops 1305 query = (cropstable.image_set_uuid == image_set_uuid) 1306 1307 # Delete uploaded files 1308 rows = db(query).select(cropstable.image_file) 1309 for row in rows: 1310 filename = row.image_file 1311 filepath = os.path.join(self.r.folder, 1312 "uploads", 1313 "ocr_payload", 1314 filename) 1315 os.remove(filepath) 1316 1317 # Delete records 1318 db(query).delete() 1319 1320 return json.dumps({"success": success, 1321 "error": errordict}) 1322 1323 else: 
1324 r.error(405, current.ERROR.BAD_METHOD) 1325 1326 else: 1327 r.error(501, current.ERROR.BAD_REQUEST)

1328 1329 # -------------------------------------------------------------------------

1330 - def __parse_job_error_tree(self, tree):

1331 """ 1332 create a dictionary of fields with errors 1333 1334 @param tree: S3ImportJob.error_tree 1335 @return: errordict 1336 """ 1337 1338 errordict = {} 1339 1340 for resource in tree: 1341 resourcename = resource.attrib.get("name") 1342 for field in resource: 1343 fieldname = field.attrib.get("field") 1344 error = field.attrib.get("error") 1345 if error: 1346 errordict["%s-%s" % (resourcename, fieldname)] = error 1347 1348 return errordict

1349 1350 # -------------------------------------------------------------------------

def dameraulevenshtein(self, seq1, seq2):
    """
        Calculate the Damerau-Levenshtein distance between sequences.

        This distance is the number of additions, deletions, substitutions,
        and transpositions needed to transform the first sequence into the
        second. Although generally used with strings, any sequences of
        comparable objects will work.

        Transpositions are exchanges of *consecutive* characters; all other
        operations are self-explanatory.

        This implementation is O(N*M) time and O(M) space, for N and M the
        lengths of the two sequences.

        @param seq1: first sequence (must support len() and indexing)
        @param seq2: second sequence (must support len() and indexing)

        @return: the distance as an integer

        Examples (self omitted):

        dameraulevenshtein('ba', 'abc')    => 2
        dameraulevenshtein('fee', 'deed')  => 2

        It works with arbitrary sequences too:

        dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])  => 2
    """

    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
    # However, only the current and two previous rows are needed at once,
    # so we only store those.
    len2 = len(seq2)
    oneago = None
    # list() so that the concatenation also works where range() does
    # not return a list
    thisrow = list(range(1, len2 + 1)) + [0]
    for x, item1 in enumerate(seq1):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len2 + [x + 1]
        for y, item2 in enumerate(seq2):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (item1 != item2)
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and item1 == seq2[y - 1]
                and seq1[x - 1] == item2 and item1 != item2):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len2 - 1]

1397 1398 # -------------------------------------------------------------------------

def __temp_ocrdataxml_parser(self, s3ocrdataxml):
    """
        Turn the XML produced by the OCR parser into nested Storages

        @param s3ocrdataxml: output of S3OCRImageParser (XML text)

        @return: Storage of {resource name: Storage of {field name: value}}
                 equivalent to the input XML
    """

    parsed_root = etree.fromstring(s3ocrdataxml)
    first_resource = parsed_root.getchildren()[0]
    ocr_root = etree.Element("s3ocr")

    if self.r.component:
        # Component request: use the resource element as it is
        ocr_root.append(first_resource)
    else:
        # Main resource: keep its own fields in a fresh element and
        # lift the nested component resources to the top level
        main_resource = etree.Element("resource")
        for attr_name in first_resource.attrib.keys():
            main_resource.set(attr_name, first_resource.attrib.get(attr_name))

        components = []
        for child in first_resource:
            if child.tag in ["data", "reference"]:
                main_resource.append(child)
            elif child.tag == "resource":
                components.append(child)

        ocr_root.append(main_resource)
        for component in components:
            ocr_root.append(component)

    result = Storage()
    for resource_node in ocr_root:
        resource_name = resource_node.attrib.get("name")
        result[resource_name] = Storage()
        for node in resource_node:
            field_name = node.attrib.get("field")
            if node.tag == "reference":
                # reference > resource > data: the data text is the value
                referenced = node.getchildren()[0]
                value = referenced.getchildren()[0].text
            else:
                # Prefer the value attribute, fall back to the element text
                value = node.attrib.get("value") or node.text
            result[resource_name][field_name] = value

    return result

1456 1457 # -------------------------------------------------------------------------

def __importjob2data(self, importjob):
    """
        Extract the data of all non-empty import items of a job

        @param importjob: S3ImportJob instance

        @return: Storage mapping str(item.table) to item.data for every
                 item of the job that carries data
    """

    collected = Storage()

    items = importjob.items
    for item_id in items.keys():
        item = items[item_id]
        payload = item.data
        if not payload or not len(payload) > 0:
            # Nothing to review in this item
            continue
        collected[str(item.table)] = payload

    return collected

1476 1477 # -------------------------------------------------------------------------

def __create_review_form(self, s3ocrdict, s3ocrdata):
    """
        Create a html review form using the available data

        For every field the form shows the cropped scan images (served
        through the "<resource>/upload.pdf/image" URL), an input widget
        pre-filled with the recognised value, the field comment and
        "clear"/"ignore" buttons.

        @param s3ocrdict: output of self.__s3ocrxml2dict()
        @param s3ocrdata: output of self.__importjob2data()

        @return: html review form (FORM with id "ocr-review-form")
    """

    request = current.request
    T = current.T
    r = self.r
    setuuid = self.setuuid

    if r.component:
        request_args = request.get("args", ["", ""])
        record_id = request_args[0]
        component_name = request_args[1]
        urlprefix = "%s/%s/%s" % (request.function,
                                  record_id,
                                  component_name)
    else:
        # Not a component
        urlprefix = request.function

    def crop_image(tablename, fname, key, item):
        """ IMG showing one cropped region of the scanned form """
        return IMG(_src=URL(request.application,
                            r.prefix,
                            "%s/upload.pdf" % urlprefix,
                            args="image",
                            vars={"setuuid": setuuid,
                                  "resource_table": tablename,
                                  "field_name": fname,
                                  key: item,
                                  }
                            ),
                   _style="border: solid #333 1px;")

    def crop_rows(tablename, fname, numlines):
        """ One full-width row per scanned line (sequence is 1-based) """
        return [TR(TD(crop_image(tablename, fname, "sequence", line),
                      _style="text-align:center; padding:5px;",
                      _colspan="4"))
                for line in range(1, numlines + 1)]

    def option_widgets(name, optct, option, input_type, css_class, checked):
        """ (INPUT, LABEL) pair for one selectable option """
        option_id = "%s-%s" % (name, optct)
        return (INPUT(_id=option_id,
                      _value=option.value,
                      _type=input_type,
                      _class=css_class,
                      _name=name,
                      value=checked),
                LABEL(option.label, _for=option_id))

    def checked_multi(optvalue, selected):
        """ Checkbox state: "on" if the option is among the selected """
        return "on" if str(optvalue) in selected else None

    def checked_single(optvalue, selected):
        """ Radio state: the option value if it is the selected one """
        return optvalue if str(optvalue) == str(selected) else None

    def reformat(value, fmt):
        """ Normalise a date/datetime (object or string) to fmt, or "" """
        try:
            return value.strftime(fmt)
        except AttributeError:
            try:
                return datetime.strptime(value, fmt).strftime(fmt)
            except ValueError:
                return ""

    ptablecontent = []
    fieldnum = 1

    for resourcename in s3ocrdict["$resource_seq"]:
        # Resource title
        resource = s3ocrdict[resourcename]
        ptablecontent.append(TR(TD(DIV(resourcename, _class="resource_name"),
                                   _colspan="4"),
                                _class="titletr")
                             )

        ctablecontent = []
        for fieldname in resource["$field_seq"]:
            field = resource[fieldname]
            comment = field.comment if field.comment else ""

            # presumably a missing resource can yield None instead of
            # raising KeyError (Storage-style lookup) - tolerate both
            try:
                ocrdata = s3ocrdata[resourcename][fieldname]
            except (KeyError, TypeError):
                ocrdata = None

            if not ocrdata:
                value = ""
            elif isinstance(ocrdata, (str, int)):
                value = str(ocrdata)
            elif isinstance(ocrdata, datetime) and field.type == "datetime":
                # datetime is a subclass of date, so it has to be tested
                # first; the time is only kept for datetime fields
                value = ocrdata.strftime("%Y-%m-%d %H:%M:%S")
            elif isinstance(ocrdata, date):
                value = ocrdata.strftime("%Y-%m-%d")
            else:
                try:
                    value = unicodedata.normalize("NFKD",
                                                  ocrdata).encode("ascii",
                                                                  "ignore")
                except TypeError:
                    # Not a text value (e.g. a float)
                    value = str(ocrdata)

            name = "%s-%s" % (resourcename, fieldname)
            fieldclass = "field-%s" % fieldnum
            input_id = "%s-id" % name.replace("-", "_")
            fieldtype = field.type

            if field.has_options:
                if fieldtype == "boolean":
                    # Booleans are always shown as two inline options
                    choices = [Storage({"value": "yes",
                                        "label": T("Yes")}),
                               Storage({"value": "no",
                                        "label": T("No")})]
                    inline = True
                else:
                    choices = field.options.list
                    inline = field.options.count <= MAX_FORM_OPTIONS_LIMIT

                if fieldtype == "multiselect":
                    input_type = "checkbox"
                    is_checked = checked_multi
                    if inline:
                        try:
                            value = value.split("|")[1:-1]
                        except (AttributeError, TypeError):
                            value = [str(value)]
                else:
                    input_type = "radio"
                    is_checked = checked_single

                if inline:
                    # One image per option, all options in a single row
                    cells = []
                    for optct, option in enumerate(choices, 1):
                        image = crop_image(resourcename, fieldname,
                                           "value", option.value)
                        widget, label = option_widgets(name, optct, option,
                                                       input_type,
                                                       fieldclass,
                                                       is_checked(option.value,
                                                                  value))
                        # list.append() takes a single item, so the three
                        # cells have to be added with extend()
                        cells.extend((TD(image,
                                         _style="text-align:center;"),
                                      TD(widget),
                                      TD(label)))
                    input_area = TABLE(TR(cells), _class=fieldclass)
                else:
                    # Too many options to print as boxes: the form has two
                    # lines of handwriting, the options are listed below
                    ctablecontent.extend(crop_rows(resourcename, fieldname, 2))
                    rows = []
                    for optct, option in enumerate(choices, 1):
                        widget, label = option_widgets(name, optct, option,
                                                       input_type,
                                                       fieldclass,
                                                       is_checked(option.value,
                                                                  value))
                        rows.append(TR(TD(widget), TD(label)))
                    input_area = TABLE(rows, _class=fieldclass)

            elif fieldtype in ["string", "integer", "double"]:
                ctablecontent.extend(crop_rows(resourcename, fieldname,
                                               field.lines))
                input_area = INPUT(_id=input_id,
                                   _class=fieldclass,
                                   _value=value, _name=name)

            elif fieldtype == "date":
                subsec = {"DD": 1,
                          "MO": 2,
                          "YYYY": 3}
                imglist = [crop_image(resourcename, fieldname,
                                      "sequence", subsec[sec])
                           for sec in ["YYYY", "MO", "DD"]]
                ctablecontent.append(TR(TD(imglist,
                                           _style="text-align:center; padding:5px;",
                                           _colspan="4")))
                value = reformat(value, "%Y-%m-%d")
                input_area = INPUT(_id=input_id,
                                   _class="field-%s date" % fieldnum,
                                   _value=value, _name=name)

            elif fieldtype == "datetime":
                subsec = {"HH": 1,
                          "MM": 2,
                          "DD": 3,
                          "MO": 4,
                          "YYYY": 5}
                imglist = [crop_image(resourcename, fieldname,
                                      "sequence", subsec[sec])
                           for sec in ["YYYY", "MO", "DD", "HH", "MM"]]
                ctablecontent.append(TR(TD(imglist,
                                           _style="text-align:center; padding:5px;",
                                           _colspan="4")))
                value = reformat(value, "%Y-%m-%d %H:%M:%S")
                input_area = INPUT(_id=input_id,
                                   _class="field-%s datetime" % fieldnum,
                                   _value=value, _name=name)

            elif fieldtype == "textbox":
                ctablecontent.extend(crop_rows(resourcename, fieldname,
                                               field.lines))
                input_area = TEXTAREA(value,
                                      _class=fieldclass,
                                      _name=name)

            else:
                input_area = SPAN()

            # Label + error placeholder | input | comment | buttons
            ctablecontent.append(TR(TD(TABLE(TR(TD(field.label)),
                                             TR(TD(SPAN(_id="%s-error" % name,
                                                        _style="font-size: 12px; font-weight:bold; color: red;",
                                                        _class="error-span")))),
                                       _class="label", _style="vertical-align: top;"),
                                    TD(input_area, _class="infield"),
                                    TD(comment, _class="comment", _style="vertical-align: top;"),
                                    TD(TAG["BUTTON"](T("clear"),
                                                     _name="button-%s" % fieldnum,
                                                     _class="clrbutton"
                                                     ),
                                       TAG["BUTTON"](T("ignore"),
                                                     _name="ignore-%s" % name,
                                                     _class="ignore-button"),
                                       _class="clear", _style="vertical-align: top;"),
                                    _class="fieldtr"))

            # Separator between fields
            ctablecontent.append(TR(TD(_colspan="4",
                                       _style="border: solid #999 3px;")))
            fieldnum += 1

        ptablecontent.extend(ctablecontent)

    # Submit button
    ptablecontent.append(TR(TD(TAG["button"](T("Submit"),
                                             _class="submit-button",
                                             _style="width: 70px; height: 20px;"),
                               _colspan="4",
                               _style="text-align:center; padding: 5px;")))

    output = FORM(TABLE(ptablecontent, _class="ptable"),
                  _id="ocr-review-form")

    return output

1853 1854 # -------------------------------------------------------------------------

def __s3ocrxml2dict(self, s3ocrxml):
    """
        Parse s3ocrxml into nested Storages so that it can be used in
        templates

        Only fields which are both readable and writable are included.

        @param s3ocrxml: content of a s3ocrxml file, in text

        @return: Storage keyed by resource name; each resource holds one
                 Storage per field plus "$field_seq" (the field order),
                 and the top level holds "$resource_seq" (the resource
                 order)
    """

    db = current.db
    ocr_root = etree.fromstring(s3ocrxml)

    result = Storage()
    resource_order = []

    for resource_node in ocr_root:
        resource_name = resource_node.attrib.get("name")
        table = db[resource_name]
        fields = result[resource_name] = Storage()
        resource_order.append(resource_name)

        field_order = []
        for node in resource_node:
            attr = node.attrib.get
            field_name = attr("name")

            if not (attr("readable") == "True" and \
                    attr("writable") == "True"):
                continue

            field_order.append(field_name)

            label = attr("label")
            ocr_type = attr("type")
            lines = attr("lines", "1")
            is_reference = attr("reference") == "1"
            target = attr("resource")
            has_options = attr("has_options") == "True"

            # Get html comment
            comment = table[field_name].comment

            # Options are only available if the field has a select child
            options = None
            if has_options:
                children = node.getchildren()
                if children:
                    select = children[0]
                    count = len(select.getchildren())
                    choices = [Storage({"label": opt.text,
                                        "value": opt.attrib.get("value")})
                               for opt in select]
                    options = Storage({"count": count,
                                       "list": choices})

            fields[field_name] = Storage({"label": label,
                                          "type": ocr_type,
                                          "comment": comment,
                                          "reference": is_reference,
                                          "resource": target,
                                          "has_options": has_options,
                                          "options": options,
                                          "lines": int(lines)
                                          })
        fields["$field_seq"] = field_order

    result["$resource_seq"] = resource_order

    return result

1942 1943 # -------------------------------------------------------------------------

def newDocument(self,
                title,
                header,
                footer,
                filename = None,
                heading=None,
                ):
    """
        This will create a new empty PDF document.
        Data then needs to be added to this document.

        The document is stored in self.doc (writing to the buffer
        self.output); nothing is returned.

        @param title: The title that will appear at the top of the document
                      and in the filename
        @param header: passed to the document's setPageTemplates()
        @param footer: passed to the document's setPageTemplates()
        @param filename: base name for the output file, defaults to
                         the title
        @param heading: heading stored as self.title, defaults to
                        the title
    """

    # Get the document variables
    now = self.request.now.isoformat()[:19].replace("T", " ")
    docTitle = "%s %s" % (title, now)
    # Identity test: "== None" would call the __eq__ of whatever object
    # has been passed in
    if filename is None:
        filename = title
    self.filename = "%s_%s.pdf" % (filename, now)
    self.output = StringIO()
    self.doc = EdenDocTemplate(self.output, title=docTitle)
    self.doc.setPageTemplates(header, footer)
    self.content = []
    if heading is None:
        heading = title
    self.title = heading
    self.prevtitle = heading
    self.setPortrait()
    self.leftMargin = 0.4 * inch
    self.rightMargin = 0.4 * inch
    self.topMargin = 0.4 * inch
    self.bottomMargin = 0.4 * inch
    self.MINIMUM_MARGIN_SIZE = 0.3 * inch
    self.setMargins()

1983 1984 # -------------------------------------------------------------------------

def newOCRForm(self,
               formUUID,
               pdfname="ocrform.pdf",
               top=65,
               left=50,
               bottom=None,
               right=None,
               **args):
    """
        Prepare a new, empty OCR form document for the current resource

        Resets self.content, self.output and self.layoutEtree, books a
        revision for the form and stores the new S3PDFOCRForm in
        self.doc; self.filename becomes "<tablename>_rev<revision>.pdf".

        @param formUUID: uuid of the form
        @param pdfname: not used in this method
        @param top: not used in this method
        @param left: not used in this method
        @param bottom: not used in this method
        @param right: not used in this method
        @param args: not used in this method
    """

    self.content = []
    self.output = StringIO()
    self.layoutEtree = etree.Element("s3ocrlayout")
    try:
        pdfTitle = current.response.s3.crud_strings[self.tablename].label_create.decode("utf-8")
    except Exception:
        # No usable CRUD string: fall back to the table name
        # (no bare except, to let SystemExit/KeyboardInterrupt through)
        pdfTitle = self.resource.tablename

    formResourceName = self.resource.tablename
    formRevision = self.__book_revision(formUUID, formResourceName)
    self.filename = "%s_rev%s.pdf" % (formResourceName, formRevision)
    self.doc = self.S3PDFOCRForm(self.output,
                                 formUUID=formUUID,
                                 pdfTitle = pdfTitle,
                                 formRevision=formRevision,
                                 formResourceName=formResourceName)

2010 2011 # -------------------------------------------------------------------------

def __getResourceForms(self):
    """
        Get all form UUIDs/revisions available for the current resource

        @return: a list of dicts {"uuid": form-uuid,
                                  "revision": form-revision},
                 one per form registered for the resource, in
                 descending order of revision
    """

    db = current.db
    meta = db.ocr_meta
    for_resource = (meta.resource_name == self.resource.tablename)
    forms = db(for_resource).select(meta.form_uuid,
                                    meta.revision,
                                    orderby=~meta.revision)
    return [{"uuid" : form.form_uuid,
             "revision": form.revision,
             }
            for form in forms]

2035 2036 # -------------------------------------------------------------------------

def __getNumPages(self, formuuid):
    """
        Look up the number of pages of a form

        @param formuuid: uuid of the form, for which
                         number of pages is required

        @return: number of pages (int) of the form identified
                 by the uuid
    """

    db = current.db
    meta = db.ocr_meta
    matching = db(meta.form_uuid == formuuid)
    form = matching.select(meta.pages,
                           limitby=(0, 1)
                           ).first()
    pages = form.pages
    return int(pages)

2054 2055 # -------------------------------------------------------------------------

def __s3OCREtree(self):
    """
        Optimise & modify the s3xml structure etree of the resource
        to produce the s3ocr etree

        Side effects: sets self.exclude_component_list and
        self.generic_ocr_field_type.

        @return: s3ocr etree (root element "s3ocr" with one "resource"
                 child per included resource)
    """

    r = self.r

    s3xml_etree = self.resource.export_struct(options=True,
                                              references=True,
                                              stylesheet=None,
                                              as_json=False,
                                              as_tree=True)

    # Additional XML tags
    # (ITEXT and HINT are not used in this method)
    ITEXT = "label"
    HINT = "comment"
    TYPE = "type"
    HASOPTIONS = "has_options"
    LINES = "lines"
    BOXES = "boxes"
    REFERENCE = "reference"
    RESOURCE = "resource"

    # Components Localised Text added to the etree
    # Converting s3xml to s3ocr_xml (nicer to traverse)
    s3xml_root = s3xml_etree.getroot()
    resource_element = s3xml_root.getchildren()[0]
    s3ocr_root = etree.Element("s3ocr")

    # Store components which have to be excluded
    settings = current.deployment_settings
    self.exclude_component_list =\
                settings.get_pdf_excluded_fields("%s_%s" % \
                                                 (r.prefix,
                                                  r.resource.name))

    if r.component: # if it is a component
        s3ocr_root.append(resource_element)

    else: # if it is main resource
        componentetrees = []
        # mres is main resource etree
        mres = etree.Element("resource")
        for attr in resource_element.attrib.keys():
            mres.set(attr, resource_element.attrib.get(attr))
        # NOTE(review): appending a child to mres moves it out of
        # resource_element while that element is being iterated -
        # confirm this is safe with the etree implementation in use
        for field_element in resource_element:
            if field_element.tag == "field": # main resource fields
                mres.append(field_element)
            elif field_element.tag == "resource": # component resource
                componentetrees.append(field_element)

        serialised_component_etrees = componentetrees

        # create s3ocr tree
        s3ocr_root.append(mres)
        for res in serialised_component_etrees:
            s3ocr_root.append(res)

    # Database fieldtype to ocr fieldtype mapping
    self.generic_ocr_field_type = {
        "string": "string",
        "text": "textbox",
        "boolean" : "boolean",
        "double": "double",
        "date": "date",
        "datetime": "datetime",
        "integer": "integer",
        "list:integer": "multiselect",
        "list:string": "multiselect",
        "list:double": "multiselect",
        "list:text": "multiselect",
        }

    # Remove fields which are not required
    # Load user-defined configurations
    FIELD_TYPE_LINES = { # mapping types with number of lines
        "string": 1,
        "textbox": 2,
        "integer": 1,
        "double": 1,
        "date": 1,
        "datetime": 1,
        }
    FIELD_TYPE_BOXES = { # mapping type with numboxes
        "integer": 8,
        "double": 16,
        }
    for resource in s3ocr_root.iterchildren():
        rget = resource.attrib.get
        resourcetablename = rget("name")

        # Exclude components
        if not r.component:
            if rget("name") in self.exclude_component_list:
                s3ocr_root.remove(resource)
                continue

        # Component alias: explicit attribute, else the table name
        # without its prefix, else the table name itself
        if "alias" in resource.attrib:
            alias = resource.attrib["alias"]
        elif "_" in resourcetablename:
            alias = resourcetablename.split("_", 1)[1]
        else:
            alias = resourcetablename

        # Find the resource the fields belong to; elements which match
        # neither the main resource nor one of its components are left
        # in the tree unprocessed
        if alias == self.resource.alias and \
           resourcetablename == self.resource.tablename:
            fieldresource = self.resource
        elif alias in self.resource.components:
            fieldresource = self.resource.components[alias]
        else:
            continue

        for field in resource.iterchildren():
            get = field.attrib.get
            # NB this local name shadows the builtin set() for the rest
            # of the method
            set = field.set
            fieldname = get("name")
            # Fields which have to be displayed
            fieldtype = get(TYPE)

            # Mark references and remember the referenced table name
            if fieldtype.startswith("reference "):
                set(RESOURCE, fieldtype.split("reference ")[1])
                set(REFERENCE, "1")
            else:
                set(REFERENCE, "0")

            # Load OCR-specific fieldtypes
            ocrfieldtype = self.generic_ocr_field_type.get(fieldtype, None)
            if ocrfieldtype != None:
                set(TYPE, ocrfieldtype)
                # Refresh fieldtypes after update
                fieldtype = get(TYPE)

            # Set num boxes and lines
            fieldhasoptions = get(HASOPTIONS)
            if fieldhasoptions == "False":
                set(LINES, str(FIELD_TYPE_LINES.get(fieldtype, 1)))
                if fieldtype in FIELD_TYPE_BOXES.keys():
                    set(BOXES, str(FIELD_TYPE_BOXES.get(fieldtype)))

            # If field is readable but not writable set default value
            if get("readable", "False") == "True" and \
               get("writable", "False") == "False":

                fieldname = get("name")
                try:
                    fielddefault = fieldresource.table[fieldname].default
                except(KeyError):
                    fielddefault = "None"
                set("default", str(fielddefault))

            # For unknown field types: fall back to a two-line string
            # without options
            if fieldtype not in self.generic_ocr_field_type.values():
                set(TYPE, "string")
                set(HASOPTIONS, "False")
                set(LINES, "2")
                # Refresh fieldtypes after update
                fieldtype = get(TYPE)

            # In OCR, boolean fields should be shown as options
            if fieldtype == "boolean":
                set(HASOPTIONS, "True")

            # Fields removed which need not be displayed
            if get("readable", "False") == "False" and \
               get("writable", "False") == "False":
                resource.remove(field)
                continue

            # Drop options without a label from the select element
            # (the first child of the field is taken to be the select)
            if get(HASOPTIONS, "False") == "True" and \
               get(TYPE) != "boolean":
                s3ocrselect = field.getchildren()[0]
                for option in s3ocrselect.iterchildren():
                    if option.text == "" or option.text == None:
                        s3ocrselect.remove(option)
                        continue

    return s3ocr_root

2235 2236 # -------------------------------------------------------------------------

def OCRPDFManager(self):
    """
        Produces OCR Compatible PDF forms

        Builds the s3ocr etree of the resource, stores its serialised
        form in self.s3ocrxml, and fills self.content with one flowable
        per label, hint and input area. Fields which take input are
        described in parallel in self.layoutEtree: a "field" element
        is created for every field and handed to the input flowables;
        it is removed again for fields which only print a default
        value.
    """

    T = current.T
    s3ocr_root = self.__s3OCREtree() # get element s3xml
    self.s3ocrxml = etree.tostring(s3ocr_root, pretty_print=DEBUG)
    self.content = []
    s3ocr_layout_etree = self.layoutEtree

    # @ToDo: Define font sizes centrally rather than in flowables
    #titlefontsize = 16
    #sectionfontsize = 14
    #regularfontsize = 12
    #hintfontsize = 10

    # Attribute names used in the s3ocr etree
    # (LINES and BOXES are not used below, the literals are)
    ITEXT = "label"
    HINT = "comment"
    TYPE = "type"
    HASOPTIONS = "has_options"
    LINES = "lines"
    BOXES = "boxes"
    REFERENCE = "reference"
    RESOURCE = "resource"

    # Localised hints; the date/datetime hints are currently only
    # referenced from commented-out code further down
    dtformat = current.deployment_settings.get_L10n_datetime_format()
    if str(dtformat)[:2] == "%m":
        # US-style
        date_hint = T("fill in order: month(2) day(2) year(4)")
        datetime_hint = T("fill in order: hour(2) min(2) month(2) day(2) year(4)")
    else:
        # ISO-style
        date_hint = T("fill in order: day(2) month(2) year(4)")
        datetime_hint = T("fill in order: hour(2) min(2) day(2) month(2) year(4)")
    l10n = {
        "datetime_hint": {
            "date": date_hint,
            "datetime": datetime_hint,
            },
        "boolean": {
            "yes": T("Yes"),
            "no": T("No"),
            },
        "select": {
            "multiselect": T("Select one or more option(s) that apply"),
            "singleselect": T("Select the option that applies"),
            },
        }

    # Print the etree
    append = self.content.append
    SubElement = etree.SubElement
    for resource in s3ocr_root:
        name = resource.attrib.get("name")
        # Create resource element of ocr layout xml
        s3ocr_layout_resource_etree = SubElement(s3ocr_layout_etree,
                                                 "resource",
                                                 name=name)

        styleSheet = getStyleSheet()
        # @ToDo: Check if this is needed by OCR (removed for now as ugly)
        #append(DrawHrLine(0.5))
        #append(Paragraph(html_unescape_and_strip(resource.attrib.get(ITEXT,
        #                                                             name)),
        #                 styleSheet["Section"]))
        #append(DrawHrLine(0.5))

        for field in resource.iterchildren():
            get = field.attrib.get
            # Create field element of ocr layout xml
            s3ocr_layout_field_etree = SubElement(s3ocr_layout_resource_etree,
                                                  "field",
                                                  name=get("name"),
                                                  type=get("type"))

            if get(REFERENCE) == "1":
                s3ocr_layout_field_etree.set(REFERENCE, "1")
                s3ocr_layout_field_etree.set(RESOURCE, get(RESOURCE))

            # Question text: the label, followed by the hint (if any)
            fieldlabel = get(ITEXT)
            spacing = " " * 5
            fieldhint = self.__trim(get(HINT))

            if fieldhint:
                append(Paragraph(html_unescape_and_strip("%s%s( %s )" % \
                                                         (fieldlabel,
                                                          spacing,
                                                          fieldhint)),
                                 styleSheet["Question"]))

            else:
                append(Paragraph(html_unescape_and_strip(fieldlabel),
                                 styleSheet["Question"]))

            if get("readable", "False") == "True" and \
               get("writable", "False") == "False":
                # Read-only field: print its default value, no input
                append(Paragraph(html_unescape_and_strip(get("default",
                                                             "No default Value")),
                                 styleSheet["DefaultAnswer"]))

                # Remove the layout component of empty fields
                s3ocr_layout_resource_etree.remove(s3ocr_layout_field_etree)

            elif get(HASOPTIONS) == "True":
                fieldtype = get(TYPE)
                # The field has to be shown with options
                if fieldtype == "boolean":
                    bool_text = l10n.get("boolean")
                    append(DrawOptionBoxes(s3ocr_layout_field_etree,
                                           [bool_text.get("yes").decode("utf-8"),
                                            bool_text.get("no").decode("utf-8")],
                                           ["yes", "no"]))

                else:
                    if fieldtype == "multiselect":
                        option_hint = l10n.get("select").get("multiselect")
                    else:
                        #option_hint = l10n.get("select").get("singleselect")
                        option_hint = None

                    s3ocrselect = field.getchildren()[0]
                    numoptions = len(s3ocrselect.getchildren())

                    if numoptions <= MAX_FORM_OPTIONS_LIMIT:
                        # NOTE(review): "limitcrossed" is set when the
                        # number of options is within the limit - confirm
                        # against the code which reads this attribute
                        s3ocr_layout_field_etree.attrib["limitcrossed"] = "1"
                        if option_hint:
                            append(DrawHintBox(option_hint.decode("utf-8")))

                        # A single iterator is shared by all rows, so
                        # each row continues where the previous stopped
                        options = s3ocrselect.iterchildren()
                        # Only show 4 options per row
                        opts = []
                        oppend = opts.append
                        # NB this local name shadows the builtin range()
                        range = int(math.ceil(numoptions / 4.0))
                        for row in xrange(range):
                            labels = []
                            lappend = labels.append
                            values = []
                            vappend = values.append
                            i = 1
                            for option in options:
                                label = option.text
                                # Skip labels which have been printed
                                # already
                                if label in opts:
                                    continue
                                oppend(label)
                                lappend(label)
                                vappend(option.attrib.get("value"))
                                if i == 4:
                                    break
                                i += 1
                            append(DrawOptionBoxes(s3ocr_layout_field_etree,
                                                   labels,
                                                   values))
                    else:
                        # Too many options for boxes: two lines of
                        # free-text input instead
                        append(DrawHintBox(T("Enter a value carefully without spelling mistakes, this field needs to match existing data.").decode("utf-8")))
                        for line in xrange(2):
                            append(StringInputBoxes(numBoxes=None,
                                                    etreeElem=s3ocr_layout_field_etree))
            else:
                # It is a text field
                fieldtype = get(TYPE)
                BOXES_TYPES = ["string", "textbox", "integer",
                               "double", "date", "datetime",]
                if fieldtype in BOXES_TYPES:
                    if fieldtype in ["string", "textbox"]:
                        #form.linespace(3)
                        num_lines = int(get("lines", 1))
                        for line in xrange(num_lines):
                            append(StringInputBoxes(numBoxes=None,
                                                    etreeElem=s3ocr_layout_field_etree))

                    elif fieldtype in ["integer", "double"]:
                        num_boxes = int(get("boxes", 9))
                        append(StringInputBoxes(numBoxes=num_boxes,
                                                etreeElem=s3ocr_layout_field_etree))

                    elif fieldtype in ["date", "datetime"]:
                        # Print hint
                        #hinttext = \
                        #    l10n.get("datetime_hint").get(fieldtype).decode("utf-8")
                        #append(DrawHintBox(hinttext))

                        if fieldtype == "datetime":
                            append(DateTimeBoxes(s3ocr_layout_field_etree))
                        elif fieldtype == "date":
                            append(DateBoxes(s3ocr_layout_field_etree))

                else:
                    # Field type which cannot be printed on the form
                    self.r.error(501, current.ERROR.PARSE_ERROR)
    return

2427 2428 # -------------------------------------------------------------------------

def __getOCRLayout(self):
    """
        Serialise the layout tree of the generated OCR form

        @return: layout xml for the generated OCR form, pretty-printed
                 if DEBUG is set
    """

    layout = self.layoutEtree
    return etree.tostring(layout, pretty_print=bool(DEBUG))

# -------------------------------------------------------------------------
@staticmethod
def __trim(text):
    """
        Helper to trim off any enclosing parenthesis

        @param text: text which need to be trimmed

        @return: text with one pair of enclosing parentheses stripped;
                 any value which is not a str enclosed in parentheses
                 (including None and the empty string) is returned
                 unchanged
    """

    # startswith/endswith are safe for the empty string, whereas
    # indexing text[0] would raise an IndexError
    if isinstance(text, str) and \
       text.startswith("(") and \
       text.endswith(")"):
        text = text[1:-1]
    return text

2455 2456 # -------------------------------------------------------------------------

def __update_dbmeta(self, formUUID, layoutXML, numPages):
    """
        Store the PDF layout information into the database/disk.

        Updates the first ocr_meta record which matches the form uuid
        with the layout xml, the s3ocr xml (self.s3ocrxml) and the
        number of pages.

        @param formUUID: uuid of the generated form
        @param layoutXML: layout xml of the generated form
        @param numPages: number of pages in the generated form
    """

    # Wrap both documents as file-like objects for the upload fields
    layout_stream = StringIO(layoutXML)
    s3ocrxml_stream = StringIO(self.s3ocrxml)

    db = current.db
    table = db.ocr_meta

    # Assumes that the record has been created before
    record = db(table.form_uuid == formUUID).select()[0]

    stored_layout = table.layout_file.store(layout_stream,
                                            "%s_xml" % formUUID)
    stored_s3ocrxml = table.s3ocrxml_file.store(s3ocrxml_stream,
                                                "%s_ocrxml" % formUUID)
    record.update_record(layout_file=stored_layout,
                         s3ocrxml_file=stored_s3ocrxml,
                         pages=numPages)

# -------------------------------------------------------------------------
@staticmethod
def __book_revision(formUUID, formResourceName):
    """
        Books a revision number for current operation in ocr_meta

        The revision is not a counter: it is the first 6 hex digits
        (upper case) of the name-based uuid5 of the resource name within
        the namespace of the form uuid.

        @param formUUID: uuid of the generated form, either a uuid.UUID
                         or a value whose str() is a valid UUID
        @param formResourceName: name of the eden resource

        @return: the revision which has been stored in ocr_meta
    """

    import uuid

    table = current.s3db.ocr_meta

    # A sequential revision was used previously:
    #selector = table["revision"].max()
    #rows = db(table.resource_name == formResourceName).select(selector)
    #row = rows.first()
    #revision = 0 if (row[selector] == None) else (row[selector] + 1)

    # uuid5() requires a UUID instance as namespace
    namespace = formUUID
    if not isinstance(namespace, uuid.UUID):
        namespace = uuid.UUID(str(namespace))
    revision = uuid.uuid5(namespace, formResourceName).hex.upper()[:6]

    table.insert(form_uuid=formUUID,
                 resource_name=formResourceName,
                 revision=revision)

    return revision

# -------------------------------------------------------------------------
@staticmethod
def defaultTitle(resource):
    """
        Method to extract a generic title from the resource using the
        crud strings

        @param: resource: a S3Resource object

        @return: the "title_list" CRUD string of the resource table; if
                 that cannot be found, the translated resource name with
                 underscores replaced by blanks
    """

    try:
        crud_strings = current.response.s3.crud_strings
        title = crud_strings.get(resource.table._tablename).get("title_list")
    except Exception:
        # No CRUD Strings for this resource
        title = None
    if title is not None:
        return title
    # Fall back to the resource name rather than returning no title
    return current.T(resource.name.replace("_", " ")).decode("utf-8")

2531 2532 # -------------------------------------------------------------------------

def setMargins(self, left=None, right=None, top=None, bottom=None):
    """
        Method to set the margins of the document

        @param left: the size of the left margin, default None
        @param right: the size of the right margin, default None
        @param top: the size of the top margin, default None
        @param bottom: the size of the bottom margin, default None

        A margin for which a value is provided is remembered on this
        object and applied to the document; for any other margin the
        value remembered last is applied to the document again.

        @todo: make this for a page rather than the document
    """

    doc = self.doc
    requested = (("leftMargin", left),
                 ("rightMargin", right),
                 ("topMargin", top),
                 ("bottomMargin", bottom),
                 )
    for attribute, size in requested:
        if size is not None:
            # Remember the new size for subsequent calls
            setattr(self, attribute, size)
        setattr(doc, attribute, getattr(self, attribute))

2569 2570 # -------------------------------------------------------------------------

def setPortrait(self):
    """
        Switch the page size of the document to the portrait variant
        of the paper size

        @todo: make this for a page rather than the document
    """

    page_size = portrait(self.paper_size)
    self.doc.pagesize = page_size

2579 2580 # -------------------------------------------------------------------------

def setLandscape(self):
    """
        Switch the page size of the document to the landscape variant
        of the paper size

        @todo: make this for a page rather than the document
    """

    page_size = landscape(self.paper_size)
    self.doc.pagesize = page_size

2589 2590 # -------------------------------------------------------------------------

def addTable(self,
             resource = None,
             raw_data = None,
             list_fields=None,
             report_groupby=None,
             report_hide_comments=False
             ):
    """
        Build a table and add it to the document flow

        @param resource: A S3Resource object
        @param raw_data: passed on to S3PDFTable as raw_data
        @param list_Fields: A list of field names
        @param report_groupby: A field name that is to be used as a sub-group
               All the records that share the same report_groupby value will
               be clustered together
        @param report_hide_comments: Any comment field will be hidden

        The table is built and formatted by the class S3PDFTable;
        whatever its build() returns (unless None) is added to the
        content of the document, ready for generating the pdf.

        If the table is too wide for the page then it will automatically
        adjust the margin, font or page orientation. If it is still too
        wide then the table will be split across multiple pages.
    """

    flowables = S3PDFTable(document=self,
                           resource=resource,
                           raw_data=raw_data,
                           list_fields=list_fields,
                           groupby=report_groupby,
                           hide_comments=report_hide_comments
                           ).build()
    if flowables is not None:
        self.content += flowables

2627 2628 # -------------------------------------------------------------------------

def extractrHeader(self,
                   rHeader
                   ):
    """
        Method to convert the HTML generated for a rHeader into PDF

        @param rHeader: either a callable which takes the current request
                        and returns the rHeader, or the rHeader itself

        The result of parsing the rHeader with S3html2pdf (unless None)
        is added to the content of the document.
    """

    r = self.r
    # Switch the representation to html so the rHeader doesn't barf,
    # and always switch it back - even if calling the rHeader fails
    representation = r.representation
    r.representation = "html"
    try:
        # Let's assume that it's a callable rHeader
        html = rHeader(r)
    except Exception:
        # Okay so maybe it wasn't ... it could be an HTML object
        html = rHeader
    finally:
        r.representation = representation
    parser = S3html2pdf(pageWidth = self.doc.width,
                        exclude_class_list=["tabs"])
    result = parser.parse(html)
    if result is not None:
        self.content += result

2651 2652 # -------------------------------------------------------------------------

def addrHeader(self,
               resource = None,
               raw_data = None,
               list_fields=None,
               report_hide_comments=False
               ):
    """
        Build a rHeader table and add it to the document flow

        @param resource: A S3Resource object
        @param raw_data: passed on to S3PDFRHeader as raw_data
        @param list_Fields: A list of field names
        @param report_hide_comments: Any comment field will be hidden

        The table is built by the class S3PDFRHeader; whatever its
        build() returns (unless None) is added to the content of the
        document, ready for generating the pdf.
    """

    flowables = S3PDFRHeader(self,
                             resource,
                             raw_data,
                             list_fields,
                             report_hide_comments
                             ).build()
    if flowables is not None:
        self.content += flowables

2680 2681 # -------------------------------------------------------------------------

def addPlainTable(self, text, style=None, append=True):
    """
        Create a Table flowable without any further processing

        @param text: the cell data, passed to Table as is
        @param style: the style, passed to Table as is
        @param append: If True then the table will be stored in the
                       document flow ready for generating the pdf.

        @return: the Table
    """

    plain_table = Table(text, style=style)
    if not append:
        return plain_table
    self.content.append(plain_table)
    return plain_table

2690 2691 # -------------------------------------------------------------------------

def addParagraph(self, text, style=None, append=True):
    """
        Create a paragraph, and optionally add it to the document flow

        @param text: The text for the paragraph
        @param style: The paragraph style, defaults to the "Normal"
                      style of the sample style sheet
        @param append: If True then the paragraph will be stored in the
                       document flow ready for generating the pdf.

        @return The paragraph, or "" if the text is empty

        With append=False the paragraph is only returned, which is
        useful if it needs to be inserted into another flowable first -
        e.g. when large amounts of text (such as a comment) are added to
        a cell of a table.
    """

    # Nothing to render
    if text == "":
        return ""

    if style is None:
        style = getSampleStyleSheet()["Normal"]
    paragraph = Paragraph(text, style)
    if append:
        self.content.append(paragraph)
    return paragraph

2718 2719 # -------------------------------------------------------------------------

def addSpacer(self, height, append=True):
    """
        Create a spacer (width 1) of the given height

        @param height: the height of the spacer
        @param append: If True then the spacer is added to the story

        @return: the Spacer
    """

    gap = Spacer(1, height)
    if not append:
        return gap
    self.content.append(gap)
    return gap

2729 2730 # -------------------------------------------------------------------------

def addOverlay(self, callback, data):
    """
        Add an overlay to the page

        @param callback: passed to Overlay as first argument
        @param data: passed to Overlay as second argument
    """

    overlay = Overlay(callback, data)
    self.content.append(overlay)

2737 2738 # -------------------------------------------------------------------------

def addBoxes(self, cnt, append=True):
    """
        Create a line of square text boxes for text entry

        @param cnt: the number of boxes, passed to StringInputBoxes
        @param append: If True then the boxes are added to the story

        @return: the StringInputBoxes flowable
    """

    # The flowable is given a throw-away "dummy" element as etree element
    input_boxes = StringInputBoxes(cnt, etree.Element("dummy"))
    if not append:
        return input_boxes
    self.content.append(input_boxes)
    return input_boxes

2748 2749 # -------------------------------------------------------------------------

def throwPageBreak(self):
    """
        Force a page break in the report by adding a PageBreak to the
        content
    """

    page_break = PageBreak()
    self.content.append(page_break)

2756 2757 # -------------------------------------------------------------------------

def changePageTitle(self, newTitle):
    """
        Add a ChangePageTitle flowable for a new title to the content

        @param newTitle: the new title, passed to ChangePageTitle
    """

    title_change = ChangePageTitle(self, newTitle)
    self.content.append(title_change)

2764 2765 # -------------------------------------------------------------------------

def getStyledTable(self, table, colWidths=None, rowHeights = None, style=None):
    """
        Method to create a simple table

        @param table: the cell data as a list of rows; cells which carry
                      a style instruction are rewritten in place by
                      addCellStyling
        @param colWidths: passed to Table as colWidths
        @param rowHeights: passed to Table as rowHeights
        @param style: list of style commands, defaults to no styles

        @return: the Table
    """

    # addCellStyling extends the style list in place, so work on a copy:
    # neither the list of the caller nor a shared default must collect
    # the cell styles of this table
    if style is None:
        style = []
    elif isinstance(style, (list, tuple)):
        style = list(style)
    (cells, style) = self.addCellStyling(table, style)
    return Table(cells,
                 colWidths=colWidths,
                 rowHeights=rowHeights,
                 style=style,
                 )

2777 2778 # -------------------------------------------------------------------------

def getTableMeasurements(self, tempTable):
    """
        Method to calculate the dimensions of the table

        The table is built into a temporary in-memory document in
        portrait orientation, with page callbacks which do nothing.

        @param tempTable: the table to measure

        @return: tuple (column widths, row heights) as found on the
                 table after the build
    """

    blank = lambda x, y: None

    scratch = EdenDocTemplate(StringIO())
    scratch.setPageTemplates(blank, blank)
    scratch.pagesize = portrait(self.paper_size)
    scratch.build([tempTable], canvasmaker=canvas.Canvas)

    measurements = (tempTable._colWidths, tempTable._rowHeights)
    return measurements

2789 2790 # -------------------------------------------------------------------------

def cellStyle(self, style, cell):
    """
        Translate a style instruction into style commands for one cell

        @param style: the instruction, "*GREY" or "*RED"
        @param cell: the cell the style applies to

        @return: a list with one TEXTCOLOR command for the cell, or an
                 empty list for any other instruction
    """

    if style not in ("*GREY", "*RED"):
        return []
    colour = colors.lightgrey if style == "*GREY" else colors.red
    return [("TEXTCOLOR", cell, cell, colour)]

2801 2802 # -------------------------------------------------------------------------

def addCellStyling(self, table, style):
    """
        Add special styles to the text in a table

        A cell whose text starts with "*" is split at the first blank
        into a style instruction and the remaining text: the style
        commands which cellStyle returns for the instruction are added
        to the style, and the cell is replaced by the remaining text.

        @param table: the cell data as a list of rows, modified in place
        @param style: list of style commands, extended in place

        @return: tuple (table, style)
    """

    for row, line in enumerate(table):
        for col, cell in enumerate(line):
            try:
                if not cell.startswith("*"):
                    continue
                (instruction, sep, text) = cell.partition(" ")
                style += self.cellStyle(instruction, (col, row))
                table[row][col] = text
            except (AttributeError, TypeError):
                # Not a text cell (e.g. a number or a flowable), or the
                # style/row cannot be modified: leave the cell as it is
                continue
    return (table, style)

2822 2823 # -------------------------------------------------------------------------

def setHeaderBanner(self, image):
    """
        Set the banner image which pageHeader adds to a page

        @param image: path of the image, joined to the folder of the
                      current request
    """

    folder = current.request.folder
    self.headerBanner = os.path.join(folder, image)

2831 2832 # -------------------------------------------------------------------------

def pageHeader(self, canvas, doc):
    """
        Draw the header of a page: the logo and the header banner (where
        set and existing as files), the title and the current UTC time.

        It is a callback method and will not be called directly

        @param canvas: the canvas to draw on
        @param doc: the document, read for its pagesize
    """

    def draw_scaled(path, x, y, height):
        # Draw the image at the given height, keeping its aspect ratio
        (iwidth, iheight) = Image.open(path).size
        canvas.drawImage(path,
                         x,
                         y,
                         width = iwidth * (height / iheight),
                         height = height)

    canvas.saveState()
    page_width = doc.pagesize[0]
    page_height = doc.pagesize[1]

    logo = self.logo
    if logo and os.path.exists(logo):
        draw_scaled(logo, inch, page_height - 1.2 * inch, 1.0 * inch)

    banner = self.headerBanner
    if banner and os.path.exists(banner):
        draw_scaled(banner, 3 * inch, page_height - 0.95 * inch, 0.75 * inch)

    # Title, centred on the page
    canvas.setFont("Helvetica-Bold", 14)
    canvas.drawCentredString(page_width / 2.0,
                             page_height - 1.3 * inch,
                             self.title)

    # Timestamp, towards the right edge of the page
    canvas.setFont("Helvetica-Bold", 8)
    now = S3DateTime.datetime_represent(datetime.utcnow(), utc=True)
    canvas.drawCentredString(page_width - 1.5 * inch,
                             page_height - 1.3 * inch,
                             now)
    canvas.restoreState()

2870 2871 # -------------------------------------------------------------------------

def pageFooter(self, canvas, doc):
    """
        Draw the footer of a page: the page number followed by the
        title remembered from the previous call.

        It is a callback method and will not be called directly

        @param canvas: the canvas to draw on
        @param doc: the document, read for its page number
    """

    canvas.saveState()
    canvas.setFont("Helvetica", 7)
    footer = "Page %d %s" % (doc.page, self.prevtitle)
    canvas.drawString(inch, 0.75 * inch, footer)
    # Remember the current title for the next footer
    self.prevtitle = self.title
    canvas.restoreState()

2887 2888 # -------------------------------------------------------------------------

def buildDoc(self):
    """
        Method to build the PDF document from the collected content.

        @return the complete document as read from the output stream

        @todo add a proper template class so that the doc.build is more generic
    """

    self.doc.build(self.content,
                   canvasmaker=canvas.Canvas)
    # Rewind the output stream to return the document from its start
    self.output.seek(0)
    return self.output.read()

# Nested classes that extend external libraries
# If the external library failed to be imported then we get a stacktrace
if reportLabImported:

    # =====================================================================
    class S3PDFOCRForm(BaseDocTemplate):
        """
            Extended class of the BaseDocTemplate to be used with OCR Forms.
            The form has a standard page template that draws handles on the
            page in the four corners, the middle of the side and bottom edges
        """

        _invalidInitArgs = ("pageTemplates",)

        # -----------------------------------------------------------------
        def __init__(self, filename, **attr):
            """
                @param filename: passed to BaseDocTemplate
                @param attr: passed to BaseDocTemplate; formUUID,
                             formResourceName, formRevision and pdfTitle
                             are also stored on the form
            """

            BaseDocTemplate.__init__(self, filename, **attr)
            get = attr.get
            self.formUUID = get("formUUID", "")
            self.formResourceName = get("formResourceName", "")
            self.formRevision = get("formRevision", "")
            self.pdfTitle = get("pdfTitle", "OCR Form")
            self.content = []
            self.leftMargin = 20
            self.rightMargin = 20
            self.topMargin = 20
            self.bottomMargin = 20
            settings = current.deployment_settings
            if settings.get_pdf_size() == "Letter":
                self.paper_size = LETTER
            else:
                self.paper_size = A4

        # -----------------------------------------------------------------
        def handle_pageBegin(self):
            """
                Override base method to add a change of page template after
                the firstpage.
            """

            self._handle_pageBegin()
            self._handle_nextPageTemplate("Later")

        # -----------------------------------------------------------------
        def build(self, content=None, canvasmaker=canvas.Canvas, **attr):
            """
                Build the document using the flowables.

                The title of the form and the filling instructions are
                put in front of the content. The number of pages is
                stored as self.numPages after the build.

                @param content: list of flowables for the body of the form
                @param canvasmaker: passed to BaseDocTemplate.build
                @param attr: accepted, but not used
            """

            T = current.T
            self._calc()    # in case we changed margins sizes etc
            frameT = Frame(self.leftMargin,
                           self.bottomMargin,
                           self.width,
                           self.height,
                           id="normal")
            self.addPageTemplates([PageTemplate(id="First",
                                                frames=frameT,
                                                onPage=self.firstPageTemplate,
                                                pagesize=self.pagesize),
                                   PageTemplate(id="Later",
                                                frames=frameT,
                                                onPage=self.laterPageTemplate,
                                                pagesize=self.pagesize)])

            # Generate PDF header
            ocrInstructions = [
                T("1. Fill the necessary fields in BLOCK CAPITAL letters.").decode("utf-8"),
                T("2. Always use one box per letter and leave one box space to separate words.").decode("utf-8"),
                T("3. Fill in the circles completely.").decode("utf-8"),
                ]
            # Put pdf title
            styleSheet = getStyleSheet()
            self.content = [Paragraph(html_unescape_and_strip(self.pdfTitle), styleSheet["Title"])]
            # Print input instructions
            append = self.content.append
            for eachInstruction in ocrInstructions:
                append(Paragraph(html_unescape_and_strip(eachInstruction),
                                 styleSheet["Instructions"]))

            # Add content
            if content:
                self.content.extend(content)
            # Build OCRable PDF form
            BaseDocTemplate.build(self, self.content,
                                  canvasmaker=canvasmaker)
            self.numPages = self.canv.getPageNumber() - 1

        # -----------------------------------------------------------------
        def firstPageTemplate(self, canvas, doc):
            """
                Template for first page, same as for the later pages
            """

            self.laterPageTemplate(canvas, doc)

        # -----------------------------------------------------------------
        def laterPageTemplate(self, canvas, doc):
            """
                Template for all pages but first
            """

            self.pageDecorate(canvas, doc)
            self.pageMeta(canvas, doc)

        # -----------------------------------------------------------------
        def pageDecorate(self, canvas, doc):
            """
                Decorate Page with blocks for OCR-ability: filled squares
                of 10x10 in the four corners, in the middle of the left
                and right edges and in the middle of the bottom edge
            """

            canvas.saveState()
            pagewidth, pageheight = self.paper_size
            right = pagewidth - 20
            top = pageheight - 20
            markers = ((10, 10),                    # btlf
                       (right, 10),                 # btrt
                       (10, top),                   # tplf
                       (pagewidth / 2 - 5, 10),     # btmd
                       (10, pageheight / 2 - 5),    # mdlf
                       (right, top),                # tprt
                       (right, pageheight / 2 - 5), # mdrt
                       )
            for x, y in markers:
                canvas.rect(x, y, 10, 10, fill=1)
            canvas.restoreState()

        # -----------------------------------------------------------------
        def pageMeta(self, canvas, doc):
            """
                Put pagenumber and other meta info on each page: the form
                uuid at the left, resource name and revision right of the
                centre and the page number at the right of the bottom line
            """

            canvas.saveState()
            canvas.setFont("Helvetica", 7)
            pagewidth, pageheight = self.paper_size
            # Distance of all meta texts from the bottom of the page
            metaHeight = 14

            pageNumberText = "Page %s" % self.canv.getPageNumber()
            pageNumberWidth = pagewidth - (((len(pageNumberText) + 2) * 5) + 40)
            canvas.drawString(pageNumberWidth, metaHeight, pageNumberText)

            uuidText = "UUID %s" % self.formUUID
            canvas.drawString(40, metaHeight, uuidText)

            otherMetaText = "Resource %s Revision %s" % (self.formResourceName,
                                                         self.formRevision)
            canvas.drawString((pagewidth / 2) + 20, metaHeight, otherMetaText)
            canvas.restoreState()

3056

# =============================================================================
class S3PDFDataSource:
    """
        Class to get the labels and the data from the database
    """

    def __init__(self, obj):
        """
            Method to create the S3PDFDataSource object

            @param obj: the object to read resource, list_fields,
                        report_groupby and hideComments from
        """

        self.resource = obj.resource
        self.list_fields = obj.list_fields
        self.report_groupby = obj.report_groupby
        self.hideComments = obj.hideComments
        self.fields = None
        self.labels = None
        self.records = False

    # -------------------------------------------------------------------------
    def select(self):
        """
            Internally used method to get the data from the database

            If the list of fields is provided then only these will be returned
            otherwise all fields on the table will be returned

            Automatically the id field will be hidden, and if
            hideComments is true then the comments field will also be hidden.

            If a groupby field is provided then this will be used as the sort
            criteria, otherwise it will sort by the first field

            The returned records are stored in the records property.
        """

        resource = self.resource
        table = resource.table
        list_fields = self.list_fields
        if not list_fields:
            # Build a new list rather than removing items from the list
            # which is being iterated over (that would skip fields)
            hide_comments = self.hideComments
            fields = [f for f in resource.readable_fields()
                      if f.type != "id" and
                         not (hide_comments and f.name == "comments")]
            if not fields:
                fields = [table.id]
            list_fields = [f.name for f in fields]
        else:
            indices = s3codec.S3Codec.indices
            list_fields = [f for f in list_fields if f not in indices]

        # Filter and orderby
        s3filter = current.response.s3.filter
        if s3filter is not None:
            resource.add_filter(s3filter)
        orderby = self.report_groupby

        # Retrieve the resource contents
        rfields = resource.resolve_selectors(list_fields)[0]
        fields = [f for f in rfields if f.show]
        headers = [f.label for f in fields]
        if orderby is None and fields:
            # No groupby field: sort by the first field
            orderby = fields[0].field
        self.records = resource.select(list_fields,
                                       limit=None,
                                       orderby=orderby,
                                       as_rows=True)

        # Pass to getLabels
        self.labels = headers
        # Pass to getData
        self.fields = fields
        # Better to return a PDF, even if it has no records
        #if not self.records:
        #    current.session.warning = current.ERROR.NO_RECORDS
        #    redirect(URL(extension=""))

    # -------------------------------------------------------------------------
    def getLabels(self):
        """
            Internally used method to get the field labels

            Used to remove the report_groupby label (if present)

            @return: the list of labels collected by select(), from which
                     every occurrence of the groupby label is removed
        """

        # Collect the labels from the select() call
        labels = self.labels
        groupby = self.report_groupby
        if groupby is not None:
            grouplabel = groupby.label
            # Replace the contents of the same list, so that self.labels
            # is updated as well
            labels[:] = [label for label in labels if label != grouplabel]
        return labels

    # -------------------------------------------------------------------------
    def getData(self):
        """
            Internally used method to format the data from the database

            This will extract the data from the returned records list.

            If there is a groupby then the records will be grouped by this field.
            For each new value the groupby field will be placed in a list of
            its own. This will then be followed by lists of the records that
            share this value

            If there is no groupby then the result is a simple matrix of
            rows by fields

            @return: tuple (subheadingList, data), where subheadingList
                     holds the row numbers (starting at 1) of the groupby
                     rows within data
        """

        # Build the data list
        data = []
        currentGroup = None
        subheadingList = []
        rowNumber = 1
        groupby = self.report_groupby
        for item in self.records or []:
            row = []
            if groupby is not None:
                # @ToDo: non-XML output should use Field.represent
                #        - this saves the extra parameter
                groupData = s3_represent_value(groupby,
                                               record=item,
                                               strip_markup=True,
                                               non_xml_output=True
                                               )
                if groupData != currentGroup:
                    currentGroup = groupData
                    data.append([groupData])
                    subheadingList.append(rowNumber)
                    rowNumber += 1

            for field in self.fields:
                if groupby is not None and field.label == groupby.label:
                    continue
                # Start from an empty text for every field, so that a
                # field without a Field object neither fails nor picks
                # up the text of the previous field
                text = ""
                if field.field:
                    text = s3_represent_value(field.field,
                                              record=item,
                                              strip_markup=True,
                                              non_xml_output=True,
                                              extended_comments=True
                                              )
                if text == "":
                    # some represents replace the data with an image which will
                    # then be lost by the strip_markup, so get back what we can
                    tname = field.tname
                    fname = field.fname
                    if fname in item:
                        text = item[fname]
                    elif tname in item and fname in item[tname]:
                        text = item[tname][fname]
                    else:
                        text = ""
                row.append(text)
            data.append(row)
            rowNumber += 1
        return (subheadingList, data)

3215

# =============================================================================
class S3PDFRHeader():
    """
        Class to build a simple table that holds the details of one record,
        which can then be placed in a pdf document

        This class doesn't need to be called directly.
        Rather see S3PDF.addrHeader()
    """

    def __init__(self,
                 document,
                 resource=None,
                 raw_data=None,
                 list_fields=None,
                 hide_comments=False
                 ):
        """
            Method to create an rHeader object

            @param document: An S3PDF object
            @param resource: An S3Resource object
            @param raw_data: data to use instead of the data retrieved
                             from the resource
            @param list_fields: A list of field names
            @param hide_comments: Any comment field will be hidden
        """

        self.pdf = document
        self.resource = resource
        self.raw_data = raw_data
        self.list_fields = list_fields
        self.hideComments = hide_comments
        self.report_groupby = None
        self.data = []
        self.subheadingList = []
        self.labels = []
        self.fontsize = 10

    # -------------------------------------------------------------------------
    def build(self):
        """
            Method to build the table.

            The table has one row per label, with the label in the first
            column and the value from the first record in the second.

            @return: A list with one Table object, or None if there
                     is no data
        """

        # Records retrieved from the resource (none without a resource)
        data = []
        if self.resource is not None:
            ds = S3PDFDataSource(self)
            # Get records
            ds.select()
            self.labels = ds.getLabels()
            self.data.append(self.labels)
            (self.subheadingList, data) = ds.getData()
            self.data += data

        if self.raw_data is not None:
            self.data = self.raw_data

        self.rheader = []
        if len(self.data) == 0:
            return None

        NONE = current.messages["NONE"]
        # Only the first record is shown
        record = data[0] if data else []
        for index, label in enumerate(self.labels):
            try:
                value = record[index]
            except (IndexError, TypeError):
                value = NONE
            self.rheader.append([label, value])

        style = [("FONTSIZE", (0, 0), (-1, -1), self.fontsize),
                 ("VALIGN", (0, 0), (-1, -1), "TOP"),
                 ("FONTNAME", (0, 0), (0, -1), "Helvetica-Bold"),
                 ("FONTNAME", (1, 0), (1, -1), "Helvetica"),
                 ]
        (self.rheader, style) = self.pdf.addCellStyling(self.rheader, style)
        table = Table(self.rheader,
                      repeatRows=1,
                      style=style,
                      hAlign="LEFT",
                      )
        return [table]

3301 3302 # ============================================================================= 3303 # Custom Flowables (used by OCR) 3304 if reportLabImported:

# =========================================================================
class DrawHrLine(Flowable):
    """
        Flowable which draws a horizontal rule across the page
    """

    def __init__(self, lineThickness):
        """
            @param lineThickness: accepted for interface compatibility;
                                  the thickness stored (and used as the
                                  flowable height) is always 1
        """

        Flowable.__init__(self)
        self.lineThickness = 1
        pdf_size = current.deployment_settings.get_pdf_size()
        self.paper_size = LETTER if pdf_size == "Letter" else A4

    # ---------------------------------------------------------------------
    def draw(self):
        """
            Draw the line 5 units below the flowable's origin, ending
            100 units short of the page width
        """

        page_width = self.paper_size[0]
        self.canv.line(0, -5, page_width - 100, -5)

    # ---------------------------------------------------------------------
    def wrap(self, availWidth, availHeight):
        """
            Occupy the full available width and the line thickness in height

            @return: tuple (width, height)
        """

        self._width, self._height = availWidth, self.lineThickness
        return self._width, self._height

3331

# =========================================================================
class StringInputBoxes(Flowable):
    """
        Flowable which draws one line of character boxes for hand-written
        input and records the line as <textbox> in the OCR layout
    """

    def __init__(self, numBoxes=None, etreeElem=None):
        """
            @param numBoxes: number of boxes to draw (integer), otherwise
                             as many as fit across the page
            @param etreeElem: the layout element to append the <textbox> to
        """

        Flowable.__init__(self)
        self.etreeElem = etreeElem
        self.numBoxes = numBoxes
        self.fontsize = 10
        self.sideLength = 15
        self.spaceAfter = 2
        pdf_size = current.deployment_settings.get_pdf_size()
        self.paper_size = LETTER if pdf_size == "Letter" else A4

    # ---------------------------------------------------------------------
    def draw(self):
        """
            Draw the boxes and register their position in the layout
        """

        canv = self.canv
        page_width, page_height = self.paper_size
        side = self.sideLength
        fontsize = self.fontsize

        if isinstance(self.numBoxes, int):
            count = self.numBoxes
        else:
            count = int((page_width - (100 + fontsize)) / side)

        canv.setLineWidth(0.90)
        canv.setStrokeGray(0.9)

        # Values are set manually
        x_padding = 6 # default
        y_padding = 4
        margin = 50 # as set
        # Reportlab's coordinate system uses bottom left
        # as origin, so we have to take top left marker as
        # origin to provide input for Python Imaging.
        marker_x, marker_y = 29, 29 # top left marker location
        avail_width, avail_height = self.layoutCoords
        x_coord = page_width - (avail_width + x_padding + margin) - \
                  marker_x + fontsize
        y_coord = page_height - (avail_height + y_padding + margin) - \
                  marker_y

        cursor = fontsize
        for _ in xrange(count):
            canv.rect(cursor, 0, side, side)
            cursor += side

        textbox = etree.SubElement(self.etreeElem,
                                   "textbox",
                                   x="%s" % x_coord,
                                   y="%s" % y_coord,
                                   side="%s" % side,
                                   boxes="%s" % count,
                                   page="%s" % canv.getPageNumber())
        textbox.text = " "

    # ---------------------------------------------------------------------
    def wrap(self, availWidth, availHeight):
        """
            Remember the available space (needed by draw to compute the
            layout coordinates) and claim one row of boxes

            @return: tuple (width, height)
        """

        self.layoutCoords = availWidth, availHeight
        self._width = availWidth
        self._height = self.sideLength + self.spaceAfter
        return self._width, self._height

3397

# =========================================================================
class DateBoxes(Flowable):
    """
        Flowable which draws the boxes for a date (two 2-box groups and
        one 4-box group, separated by "/") and records each group as
        <textbox> in the OCR layout
    """

    def __init__(self, etreeElem):
        """
            @param etreeElem: the layout element to append the <textbox>
                              elements to
        """

        Flowable.__init__(self)
        self.etreeElem = etreeElem
        self.fontsize = 10
        self.sideLength = 15
        self.spaceAfter = 2
        pdf_size = current.deployment_settings.get_pdf_size()
        self.paper_size = LETTER if pdf_size == "Letter" else A4

    # ---------------------------------------------------------------------
    def draw(self):
        """
            Draw the boxes and register the groups in the layout; month
            comes first if the deployment's datetime format starts
            with "%m", otherwise day comes first
        """

        canv = self.canv
        page_width, page_height = self.paper_size
        canv.setLineWidth(0.90)
        canv.setStrokeGray(0.9)

        # Values are set manually
        x_padding = 6 # default
        y_padding = 4
        margin = 50 # as set
        # Reportlab's coordinate system uses bottom left
        # as origin, so we have to take top left marker as
        # origin to provide input for Python Imaging.
        marker_x, marker_y = 29, 29 # top left marker location
        avail_width, avail_height = self.layoutCoords
        x_coord = page_width - (avail_width + x_padding + margin) - \
                  marker_x + self.fontsize
        y_coord = page_height - (avail_height + y_padding + margin) - \
                  marker_y

        side = self.sideLength
        cursor = self.fontsize
        for position in xrange(1, 11):
            if position in (3, 6):
                # Separator between the groups
                canv.drawString(cursor + 5, self.height, "/")
            else:
                canv.rect(cursor, 0, side, side)
            cursor += 15

        dtformat = current.deployment_settings.get_L10n_datetime_format()
        if str(dtformat)[:2] == "%m":
            # US-style
            first, second = "MO", "DD"
        else:
            # ISO-style
            first, second = "DD", "MO"

        # (label, x position, number of boxes) per group
        groups = ((first, x_coord, "2"),
                  (second, x_coord + (side * 3), "2"),
                  ("YYYY", x_coord + (side * 6), "4"),
                  )
        for label, x, boxes in groups:
            textbox = etree.SubElement(self.etreeElem,
                                       "textbox",
                                       x="%s" % x,
                                       y="%s" % y_coord,
                                       side="%s" % side,
                                       boxes=boxes,
                                       page="%s" % canv.getPageNumber())
            textbox.text = label

    # ---------------------------------------------------------------------
    def wrap(self, availWidth, availHeight):
        """
            Remember the available space (needed by draw to compute the
            layout coordinates) and claim one row of boxes

            @return: tuple (width, height)
        """

        self.layoutCoords = availWidth, availHeight
        self._width = availWidth
        self._height = self.sideLength + self.spaceAfter
        return self._width, self._height

3504

# =========================================================================
class DateTimeBoxes(Flowable):
    """
        Flowable which draws the boxes for a time (HH, MM) followed by a
        date, and records each group as <textbox> in the OCR layout
    """

    def __init__(self, etreeElem):
        """
            @param etreeElem: the layout element to append the <textbox>
                              elements to
        """

        Flowable.__init__(self)
        self.etreeElem = etreeElem
        self.fontsize = 10
        self.sideLength = 15
        self.spaceAfter = 2
        pdf_size = current.deployment_settings.get_pdf_size()
        self.paper_size = LETTER if pdf_size == "Letter" else A4

    # ---------------------------------------------------------------------
    def draw(self):
        """
            Draw the boxes and register the groups in the layout; month
            comes before day if the deployment's datetime format starts
            with "%m", otherwise day comes first
        """

        canv = self.canv
        page_width, page_height = self.paper_size
        canv.setLineWidth(0.90)
        canv.setStrokeGray(0.9)

        # Values are set manually
        x_padding = 6 # default
        y_padding = 4
        margin = 50 # as set
        # Reportlab's coordinate system uses bottom-left
        # as origin, so we have to take top-left marker as
        # origin to provide input for Python Imaging.
        marker_x, marker_y = 29, 29 # top-left marker location
        avail_width, avail_height = self.layoutCoords
        x_coord = page_width - (avail_width + x_padding + margin) - \
                  marker_x + self.fontsize
        y_coord = page_height - (avail_height + y_padding + margin) - \
                  marker_y

        side = self.sideLength
        cursor = self.fontsize
        gaps = (3, 6, 7, 10, 13)
        for position in xrange(1, 18):
            if position not in gaps:
                canv.rect(cursor, 0, side, side)
            cursor += 15

        dtformat = current.deployment_settings.get_L10n_datetime_format()
        if str(dtformat)[:2] == "%m":
            # US-style
            first, second = "MO", "DD"
        else:
            # ISO-style
            first, second = "DD", "MO"

        # (label, x position, number of boxes) per group
        groups = (("HH", x_coord, "2"),
                  ("MM", x_coord + (side * 3), "2"),
                  (first, x_coord + (side * 7), "2"),
                  (second, x_coord + (side * 10), "2"),
                  ("YYYY", x_coord + (side * 13), "4"),
                  )
        for label, x, boxes in groups:
            textbox = etree.SubElement(self.etreeElem,
                                       "textbox",
                                       x="%s" % x,
                                       y="%s" % y_coord,
                                       side="%s" % side,
                                       boxes=boxes,
                                       page="%s" % canv.getPageNumber())
            textbox.text = label

    # ---------------------------------------------------------------------
    def wrap(self, availWidth, availHeight):
        """
            Remember the available space (needed by draw to compute the
            layout coordinates) and claim one row of boxes

            @return: tuple (width, height)
        """

        self.layoutCoords = availWidth, availHeight
        self._width = availWidth
        self._height = self.sideLength + self.spaceAfter
        return self._width, self._height

3620

# =========================================================================
class DrawOptionBoxes(Flowable):
    """
        Flowable which draws a row of option circles (for Boolean or
        Multi-Select) with their labels, and records each option as
        <optionbox> in the OCR layout
    """

    def __init__(self, etreeElem, labels, values):
        """
            @param etreeElem: the layout element to append the <optionbox>
                              elements to
            @param labels: the option labels (at least one)
            @param values: the option values, in the order of the labels
        """

        Flowable.__init__(self)
        self.etreeElem = etreeElem
        self.labels = labels
        self.values = values
        self.text = labels[0]
        self.fontsize = 8
        self.spaceAfter = 2
        pdf_size = current.deployment_settings.get_pdf_size()
        self.paper_size = LETTER if pdf_size == "Letter" else A4

    # ---------------------------------------------------------------------
    def draw(self):
        """
            Draw one circle + label per option and register the circle
            positions in the layout
        """

        canv = self.canv
        page_width, page_height = self.paper_size
        canv.setLineWidth(0.90)
        canv.setStrokeGray(0.9)

        fontsize = self.fontsize
        radius = (fontsize / 2) - 1

        # Values are set manually
        x_padding = 6 # default
        y_padding = 8
        margin = 50 # as set
        # Reportlab's coordinate system uses bottom left
        # as origin, so we have to take top-left marker as
        # origin to provide input for Python Imaging.
        marker_x, marker_y = 29, 29 # top-left marker location
        avail_width, avail_height = self.layoutCoords
        x_base = page_width - (avail_width + x_padding + margin) - marker_x
        y_base = page_height - (avail_height + y_padding + margin) - marker_y

        baseline = self.height
        left = self.width
        center_y = baseline + (fontsize / 4) + 1
        # Fixed part of the advance from one option to the next
        advance_base = left + fontsize
        # Current X of the circle center and of the label
        center_x = left + fontsize
        label_x = left + (fontsize * 2)

        values = self.values
        etree_elem = self.etreeElem
        for index, label in enumerate(self.labels):
            # Draw circle to fill-in
            canv.circle(center_x, center_y, radius, fill=0)
            # Add label
            canv.drawString(label_x, baseline,
                            html_unescape_and_strip(label))
            # NOTE(review): the y coordinate is derived from the circle's
            # X position, not its Y position - looks unintended, confirm
            # against the OCR reader before changing
            option = etree.SubElement(etree_elem,
                                      "optionbox",
                                      x="%s" % (x_base + center_x),
                                      y="%s" % (y_base + center_x),
                                      radius="%s" % radius,
                                      boxes="1",
                                      page="%s" % canv.getPageNumber())
            option.set("value", values[index])
            option.text = label
            advance = advance_base + (fontsize * (len(label) + 2)) / 1.4
            center_x += advance
            label_x += advance

    # ---------------------------------------------------------------------
    def wrap(self, availWidth, availHeight):
        """
            Remember the available space (needed by draw to compute the
            layout coordinates) and estimate the size from the label
            lengths

            @return: tuple (width, height)
        """

        self.layoutCoords = availWidth, availHeight
        units = sum(len(label) + 8 for label in self.labels)
        fontsize = self.fontsize
        self._width = (fontsize * units) / 2
        self._height = fontsize + self.spaceAfter
        return self._width, self._height

3711

# =========================================================================
class DrawHintBox(Flowable):
    """
        Flowable which prints a small grey help text explaining how to
        fill out a question
    """

    def __init__(self, text=""):
        """
            @param text: the help text (may contain HTML, which is
                         removed when drawing)
        """

        Flowable.__init__(self)
        self.text = text
        self.fontsize = 6
        self.spaceAfter = 2
        pdf_size = current.deployment_settings.get_pdf_size()
        self.paper_size = LETTER if pdf_size == "Letter" else A4

    # ---------------------------------------------------------------------
    def draw(self):
        """
            Print the text in grey, indented by half the font size
        """

        canv = self.canv
        canv.setFillGray(0.4)
        indent = self.width + (self.fontsize / 2)
        canv.drawString(indent,
                        self.height,
                        html_unescape_and_strip(self.text))

    # ---------------------------------------------------------------------
    def wrap(self, availWidth, availHeight):
        """
            Estimate the size from the text length and the font size

            @return: tuple (width, height)
        """

        fontsize = self.fontsize
        characters = len(self.text) + 4
        self._width = (fontsize * characters) / 2
        self._height = fontsize + self.spaceAfter
        return self._width, self._height

# -------------------------------------------------------------------------
# Custom styleSheets
# Font name variants derived from ReportLab's configured base font
# - presumably bold, italic and bold-italic (going by the B/I/BI suffixes
#   and the two flags passed to tt2ps) - TODO confirm
_baseFontNameB = tt2ps(_baseFontName, 1, 0)
_baseFontNameI = tt2ps(_baseFontName, 0, 1)
_baseFontNameBI = tt2ps(_baseFontName, 1, 1)

def getStyleSheet():
    """
        Build the paragraph style sheet: ReportLab's sample style sheet
        extended with the styles "Instructions" (alias "Inst"),
        "Section" ("Sec"), "Question" ("Quest") and "DefaultAnswer"
        ("DefAns"), all in the base font

        @return: the style sheet
    """

    styles = getSampleStyleSheet()

    # (name, alias, parent style, attributes)
    custom_styles = (
        ("Instructions", "Inst", "Bullet",
         {"fontSize": 12, "firstLineIndent": 0, "spaceBefore": 3}),
        ("Section", "Sec", "Normal",
         {"fontSize": 13, "spaceBefore": 5, "spaceAfter": 5,
          "firstLineIndent": 0}),
        ("Question", "Quest", "Normal",
         {"fontSize": 11, "firstLineIndent": 0, "spaceAfter": 5,
          "spaceBefore": 10}),
        ("DefaultAnswer", "DefAns", "Normal",
         {"fontSize": 10, "firstLineIndent": 0, "spaceBefore": 3}),
    )
    for name, alias, parent, attributes in custom_styles:
        style = ParagraphStyle(name=name,
                               parent=styles[parent],
                               fontName=_baseFontName,
                               **attributes)
        styles.add(style, alias=alias)

    return styles

# Helper functions (used by OCR)
def html_unescape_and_strip(m):
    """
        Unescape the HTML entities in a text, then strip its HTML markup

        @param m: the text
    """

    unescaped = html_unescape(m)
    return html_strip(unescaped)

# =============================================================================
def html_unescape(text):
    """
        Helper function, unescape any HTML special characters

        Decodes named entities (e.g. &amp;) as well as decimal and
        hexadecimal character references (e.g. &#39; or &#x27;).
        Anything that cannot be decoded is left unchanged.

        @param text: the text

        @return: the text with the entities replaced by their characters
    """

    def decode(match):
        entity = match.group(1)
        try:
            if entity[0] != "#":
                return unichr(name2codepoint[entity])
            if entity[1] in "xX":
                return unichr(int(entity[2:], 16))
            return unichr(int(entity[1:]))
        except (KeyError, ValueError, OverflowError):
            # Unknown name or code point out of range => keep as-is
            return match.group(0)

    return re.sub(r"&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);",
                  decode,
                  text)

3798

# =============================================================================
def html_strip(text):
    """
        Strips HTML markup from text

        A tag starts at a "<" which is followed by any character other
        than a space, and ends at the next ">". Tags are removed one at
        a time, always the first complete one in the remaining text, so
        that a "<" or ">" surrounded by spaces ("a < b") is preserved.

        Works iteratively, i.e. the number of tags is not limited by
        the recursion limit.

        @param text: the text

        @return: the text without markup
    """

    while True:
        tagstart = None
        for index, char in enumerate(text):
            if char == "<":
                # A "<" at the very end or followed by a space is no tag
                if text[index + 1:index + 2] not in ("", " "):
                    tagstart = index
            elif char == ">" and tagstart is not None:
                text = text[:tagstart] + text[index + 1:]
                # Rescan: the removal can complete an earlier tag
                break
        else:
            # No (more) complete tag found
            return text

# =============================================================================
# Convert unicode to ascii compatible strings
def cast2ascii(m):
    """
        Convert unicode to an ASCII-compatible string: str instances are
        returned unchanged, anything else is NFKD-normalised and encoded
        as ASCII, dropping whatever cannot be encoded

        @param m: the str or unicode to convert
    """

    if isinstance(m, str):
        return m
    normalised = unicodedata.normalize("NFKD", m)
    return normalised.encode("ascii", "ignore")

3839 3840 # ============================================================================= 3841 -class S3OCRImageParser(object):

3842 """ 3843 Image Parsing and OCR Utility 3844 """ 3845

def __init__(self, s3method, r):
    """
        Initialise the parser with the current request environment

        @param s3method: not used here
        @param r: the request, stored as self.r and handed to
                  checkDependencies
    """

    self.r, self.request = r, current.request
    checkDependencies(r)

3854 3855 # -------------------------------------------------------------------------

def parse(self, form_uuid, set_uuid, **kwargs):
    """
        Performs OCR on a given set of pages

        @param form_uuid: the UUID of the form (key into ocr_meta)
        @param set_uuid: the UUID of the uploaded image set
                         (key into ocr_payload)
        @param kwargs: not used

        @return: the recognised data as S3XML string
    """

    self.set_uuid = set_uuid
    db = current.db
    r = self.r
    folder = r.folder

    # Get metadata of the form
    metatable = db["ocr_meta"]
    meta = db(metatable["form_uuid"] == form_uuid).select(limitby=(0, 1)).first()
    resourcename = meta["resource_name"]
    layoutfilename = meta["layout_file"]
    pages = int(meta["pages"])
    is_component = len(r.resource.components) == 1

    # Open and normalise each page
    payloadtable = db["ocr_payload"]
    images = {}
    for pagenum in xrange(1, pages + 1):
        query = (payloadtable["image_set_uuid"] == set_uuid) & \
                (payloadtable["page_number"] == pagenum)
        payload = db(query).select().first()
        if payload is None:
            # Page not uploaded: reported as "insufficient number of
            # pages" as soon as a field on this page is read
            continue
        raw_image = Image.open(os.path.join(folder,
                                            "uploads",
                                            "ocr_payload",
                                            payload["image_file"]))
        images[pagenum] = self._prepare_page(raw_image)

    # Get layout file, convert it to etree
    layout_path = os.path.join(folder, "uploads", "ocr_meta", layoutfilename)
    with open(layout_path, "rb") as layout_file:
        layout_etree = etree.fromstring(layout_file.read())

    # Data etree
    SubElement = etree.SubElement
    root = etree.Element("s3xml")
    parent_elem = None

    for resource in layout_etree:
        # Create data etree: for a master resource, all further
        # resources are nested inside the first one
        if is_component:
            resource_elem = SubElement(root, "resource")
        elif parent_elem is None:
            parent_elem = resource_elem = SubElement(root, "resource")
        else:
            resource_elem = SubElement(parent_elem, "resource")

        tablename = resource.attrib.get("name", None)
        resource_elem.set("name", tablename)

        for field in resource:
            field_name = field.attrib.get("name", None)
            field_type = field.attrib.get("type", None)
            is_reference = field.attrib.get("reference") == "1"

            # Create data/reference etree
            if is_reference:
                field_resource = field.attrib.get("resource")
                reference_elem = SubElement(resource_elem, "reference")
                reference_elem.set("field", field_name)
                reference_elem.set("resource", field_resource)
                referenced = SubElement(reference_elem, "resource")
                referenced.set("name", field_resource)
                data_elem = SubElement(referenced, "data")
                data_elem.set("field", "name")
            else:
                reference_elem = None
                data_elem = SubElement(resource_elem, "data")
                data_elem.set("field", field_name)

            components = list(field)
            if not components:
                continue

            # Arguments identifying the field for __ocrIt
            ident = {"form_uuid": form_uuid,
                     "resourcename": resourcename,
                     "resource_table": tablename,
                     "field_name": field_name,
                     }

            component_type = components[0].tag
            if component_type == "optionbox":
                null_field = self._read_optionboxes(images,
                                                    components,
                                                    ident,
                                                    data_elem)
            elif component_type == "textbox":
                if field_type in ("date", "datetime"):
                    self._read_datetime(images,
                                        components,
                                        ident,
                                        data_elem,
                                        field_type)
                    null_field = False
                else:
                    null_field = self._read_text(images,
                                                 components,
                                                 ident,
                                                 data_elem,
                                                 field_type)
            else:
                continue

            if null_field:
                # Nothing recognised => do not report the field at all
                if is_reference:
                    resource_elem.remove(reference_elem)
                else:
                    resource_elem.remove(data_elem)

    return etree.tostring(root, pretty_print=True)

# -------------------------------------------------------------------------
def _prepare_page(self, raw_image):
    """
        Binarise a page image, rotate it upright and locate its markers

        @param raw_image: the PIL image of the page

        @return: dict with "image", "markers", "orientation", "scalefactor"
    """

    image = self.__convertImage2binary(raw_image)
    markers = self.__getMarkers(image)
    orientation = self.__getOrientation(markers)
    if orientation != 0.0:
        image = image.rotate(orientation)
        markers = self.__getMarkers(image)
        orientation = self.__getOrientation(markers)
    return {"image": image,
            "markers": markers,
            "orientation": orientation,
            "scalefactor": self.__scaleFactor(markers),
            }

# -------------------------------------------------------------------------
def _page_geometry(self, images, pagenum):
    """
        Get image, origin and scale factors of a page, or report an
        error (501) if the page has not been provided

        @return: tuple (image, origin_x, origin_y, scale_x, scale_y)
    """

    try:
        page = images[pagenum]
    except KeyError:
        self.r.error(501,
                     current.T("insufficient number of pages provided"))
    origin = page["markers"][0]
    scale = page["scalefactor"]
    return page["image"], origin[0], origin[1], scale["x"], scale["y"]

# -------------------------------------------------------------------------
def _to_unicode(self, value):
    """
        Convert a value read from the layout into unicode (lxml returns
        unicode for non-ASCII text, which str() cannot handle)
    """

    if isinstance(value, unicode):
        return value
    return str(value).decode("utf-8")

# -------------------------------------------------------------------------
def _int_in_range(self, value, minimum, maximum, default):
    """
        Convert an OCR result into an integer

        @return: the integer if it is within minimum..maximum (inclusive),
                 otherwise (or if value is not a number) the default
    """

    try:
        number = int(value)
    except (TypeError, ValueError):
        return default
    return number if minimum <= number <= maximum else default

# -------------------------------------------------------------------------
def _ocr_textbox(self, images, component, ident, seq):
    """
        Crop one <textbox> component out of its page and OCR it

        @param seq: the sequence number of the component within the field

        @return: the recognised text
    """

    get = component.attrib.get
    x = float(get("x"))
    y = float(get("y"))
    boxes = int(get("boxes"))
    side = float(get("side"))
    image, ox, oy, sx, sy = self._page_geometry(images, int(get("page")))
    crop_box = (int(ox + (x * sx)),
                int(oy + (y * sy)),
                int(ox + (x * sx) + side * boxes * sx),
                int(oy + (y * sy) + side * sy),
                )
    return self.__ocrIt(image.crop(crop_box),
                        linenum=seq,
                        field_seq=seq,
                        **ident)

# -------------------------------------------------------------------------
def _read_optionboxes(self, images, components, ident, data_elem):
    """
        Detect the marked options of a field and store them in the
        data element

        @return: True if no option is marked
    """

    texts = []
    values = []
    for linenum, component in enumerate(components):
        get = component.attrib.get
        x = float(get("x"))
        y = float(get("y"))
        radius = float(get("radius"))
        value = self._to_unicode(get("value"))
        text = self._to_unicode(component.text)
        image, ox, oy, sx, sy = self._page_geometry(images,
                                                    int(get("page")))
        # Square around the circle
        crop_box = (int(ox + (x * sx) - radius * sx),
                    int(oy + (y * sy) - radius * sy),
                    int(ox + (x * sx) + radius * sx),
                    int(oy + (y * sy) + radius * sy),
                    )
        marked = self.__ocrIt(image.crop(crop_box),
                              linenum=linenum,
                              content_type="optionbox",
                              field_value=value,
                              **ident)
        if marked:
            texts.append(text.strip())
            values.append(value.strip())

    # Store values into xml
    if len(values) > 1:
        data_elem.set("value", "|%s|" % "|".join(values))
        data_elem.text = "|%s|" % "|".join(texts)
    else:
        data_elem.set("value", "|".join(values))
        data_elem.text = "|".join(texts)

    return not values

# -------------------------------------------------------------------------
def _read_datetime(self, images, components, ident, data_elem, field_type):
    """
        OCR the boxes of a date or datetime field and store the result
        in the data element; parts which cannot be recognised or are
        out of range fall back to defaults (current year, 1st of
        January, 00:00)
    """

    recognised = {}
    for seq, component in enumerate(components, 1):
        part = self._to_unicode(component.text)
        output = self._ocr_textbox(images, component, ident, seq)
        recognised[part] = output.decode("utf-8").strip()

    get = recognised.get
    yyyy = self._int_in_range(get("YYYY"), 1800, 2299, datetime.now().year)
    mo = self._int_in_range(get("MO"), 1, 12, 1)
    dd = self._int_in_range(get("DD"), 1, 31, 1)

    # Days which do not exist in the month fall back to the 1st
    leapyear = yyyy % 4 == 0 and (yyyy % 100 != 0 or yyyy % 400 == 0)
    if mo in (4, 6, 9, 11):
        if dd == 31:
            dd = 1
    elif mo == 2:
        if dd > (29 if leapyear else 28):
            dd = 1

    if field_type == "date":
        value = "%s-%s-%s" % (yyyy, mo, dd)
    else:
        mm = self._int_in_range(get("MM"), 0, 59, 0)
        hh = self._int_in_range(get("HH"), 0, 23, 0)
        utc = self.__convert_utc(yyyy, mo, dd, hh, mm)
        # Formatted manually, since strftime of Python 2 does not
        # support years before 1900
        value = "%04d-%02d-%02dT%02d:%02d:%02dZ" % (utc.year,
                                                    utc.month,
                                                    utc.day,
                                                    utc.hour,
                                                    utc.minute,
                                                    utc.second)
    data_elem.set("value", value)
    data_elem.text = value

# -------------------------------------------------------------------------
def _read_text(self, images, components, ident, data_elem, field_type):
    """
        OCR the boxes of a text or numeric field and store the result
        in the data element

        @return: True if nothing has been recognised
    """

    outputs = [self._ocr_textbox(images, component, ident, seq)
               for seq, component in enumerate(components, 1)]
    output = "".join(outputs).decode("utf-8").strip()

    # Store OCRText
    if field_type in ("double", "integer"):
        try:
            output = int(self.__strip_spaces(output))
        except (TypeError, ValueError):
            output = 0
        data_elem.set("value", "%s" % output)
        data_elem.text = "%s" % output
    else:
        data_elem.text = output

    return len("%s" % output) == 0

4243 4244 # -------------------------------------------------------------------------

4245 - def __strip_spaces(self, text):

4246 """ 4247 Remove all spaces from a string 4248 """ 4249 4250 try: 4251 text = "".join(text.split()) 4252 except: 4253 pass 4254 4255 return text

4256 4257 # -------------------------------------------------------------------------

def __convert_utc(self,
                  yyyy,
                  mo,
                  dd,
                  hh,
                  mm):
    """
        Convert local time to UTC, using the UTC offset of the
        logged-in user

        The offset is expected as "+HHMM" or "-HHMM", optionally
        preceded by a word (e.g. "UTC +0100"). Without a user, or if
        the offset cannot be interpreted, the time is returned as-is.

        @param yyyy: the year
        @param mo: the month
        @param dd: the day
        @param hh: the hours
        @param mm: the minutes

        @return: the UTC time as (naive) datetime

        @raise ValueError: if the values do not form a valid date/time
    """

    localtime = datetime(int(yyyy), int(mo), int(dd), int(hh), int(mm))

    user = current.auth.user
    utc_offset = user.utc_offset if user else None
    if not utc_offset:
        return localtime

    tokens = str(utc_offset).split()
    offset = tokens[-1] if tokens else ""
    if len(offset) != 5 or offset[0] not in ("+", "-"):
        return localtime
    try:
        tdelta = timedelta(hours=int(offset[1:3]),
                           minutes=int(offset[3:5]))
    except ValueError:
        return localtime

    # Local time is ahead of UTC for positive offsets
    if offset[0] == "+":
        return localtime - tdelta
    return localtime + tdelta

4294 4295 # -------------------------------------------------------------------------

def __ocrIt(self,
            image,
            form_uuid,
            resourcename,
            linenum,
            content_type="textbox",
            **kwargs):
    """
        Put Tesseract to work, actual OCRing will be done here

        The cropped image is stored in ocr_field_crops in any case.

        @param image: the cropped PIL image of the component
        @param form_uuid: the UUID of the form (for the file names)
        @param resourcename: the resource name (for the file names)
        @param linenum: the line number (for the file names)
        @param content_type: "textbox" (run Tesseract) or "optionbox"
                             (check how dark the crop is)
        @keyword resource_table: the table name to store with the crop
        @keyword field_name: the field name to store with the crop
        @keyword field_value: the option value (optionbox only)
        @keyword field_seq: the sequence number of the box (textbox only)

        @return: for textboxes the recognised text (newlines replaced
                 by blanks), for optionboxes True if the option is
                 marked, otherwise None

        @raise ValueError: for any other content_type
    """

    import uuid

    get = kwargs.get
    resource_table = get("resource_table")
    field_name = get("field_name")

    # Unique prefix, so that concurrent requests do not collide
    prefix = "%s_%s_%s_%s" % (uuid.uuid1(),
                              form_uuid,
                              resourcename,
                              linenum)
    imgfilename = "%s.png" % prefix

    ocr_temp_dir = os.path.join(self.r.folder, "uploads", "ocr_temp")
    try:
        os.mkdir(ocr_temp_dir)
    except OSError:
        # Exists already (possibly created by a concurrent request)
        if not os.path.isdir(ocr_temp_dir):
            raise

    if content_type == "optionbox":
        self._store_crop(image,
                         ocr_temp_dir,
                         imgfilename,
                         resource_table=resource_table,
                         field_name=field_name,
                         value=get("field_value"))
        # A marked option is mostly dark
        if ImageStat.Stat(image).mean[0] < 96:
            return True
        return None

    if content_type != "textbox":
        raise ValueError("unsupported content type: %s" % content_type)

    inputpath = os.path.join(ocr_temp_dir, "%s.tif" % prefix)
    outputbase = os.path.join(ocr_temp_dir, "%s_text" % prefix)
    # Tesseract appends the extension itself
    outputpath = "%s.txt" % outputbase

    image.save(inputpath)
    try:
        if subprocess.call(["tesseract", inputpath, outputbase]) != 0:
            self.r.error(501, ERROR.TESSERACT_ERROR)
        with open(outputpath) as outputfile:
            output = outputfile.read().replace("\n", " ")
    finally:
        for path in (inputpath, outputpath):
            if os.path.exists(path):
                os.remove(path)

    self._store_crop(image,
                     ocr_temp_dir,
                     imgfilename,
                     resource_table=resource_table,
                     field_name=field_name,
                     sequence=get("field_seq"))

    try:
        os.rmdir(ocr_temp_dir)
    except OSError:
        # Not empty, i.e. still in use by a concurrent request
        pass

    return output

# -------------------------------------------------------------------------
def _store_crop(self, image, directory, filename, **fields):
    """
        Store a cropped image in ocr_field_crops

        @param image: the cropped PIL image
        @param directory: the directory for the temporary image file
        @param filename: the name for the image file
        @param fields: further field values for the record
    """

    table = current.db["ocr_field_crops"]
    path = os.path.join(directory, filename)
    image.save(path)
    try:
        # Binary mode, the image must be stored byte by byte
        with open(path, "rb") as imgfile:
            table.insert(image_set_uuid=self.set_uuid,
                         image_file=table["image_file"].store(imgfile,
                                                              filename),
                         **fields)
    finally:
        os.remove(path)

4389 4390 # -------------------------------------------------------------------------

def __convertImage2binary(self, image, threshold = 180):
    """
        Converts the image into a binary (black/white) image

        @param image: the image to convert (passed to ImageOps.grayscale)
        @param threshold: gray level below which a pixel becomes
                          black (0); all other pixels become white (255)

        @return: the converted grayscale image, containing only the
                 values 0 and 255
    """

    grayscale = ImageOps.grayscale(image)

    # point() maps every gray level through the function in one go,
    # which replaces the per-pixel getpixel/putpixel loop
    return grayscale.point(lambda level: 0 if level < threshold else 255)

4408 4409 # -------------------------------------------------------------------------

def __findRegions(self, im):
    """
        Return the list of regions (connected groups of black pixels)
        in the image, found by two-pass connected component labelling:

        First pass:
            Scan the pixels column by column. Every black pixel
            (value 0 after conversion to mode "L") takes over the label
            of its already-visited neighbour in the previous column
            or in the previous row; if it has none, it gets a new
            label. If both neighbours are labelled, their labels are
            recorded as equivalent.

        Second pass:
            Every labelled pixel is added to the region of the
            representative label of its equivalence class.

        Equivalences are kept in a union-find structure so that they
        are resolved transitively (label 3 ~ label 2 and label 3 ~
        label 1 implies that 1, 2 and 3 form a single region).

        ( source: http://en.wikipedia.org/wiki/Connected_Component_Labeling )

        @param im: the image; must provide size, convert() and
                   getpixel()

        @return: list of __Region instances, in no particular order
    """

    width, height = im.size
    im = im.convert("L")

    # labels[x][y] is the provisional label of the pixel, 0=background
    labels = [[0] * height for _ in range(width)]

    # parent[label] is the next label towards the representative of
    # the equivalence class; index 0 is a placeholder for background
    parent = [0]

    def find(label):
        """ Representative (smallest) label of an equivalence class """
        root = label
        while parent[root] != root:
            root = parent[root]
        # Path compression: re-link the visited labels to the root
        while parent[label] != root:
            following = parent[label]
            parent[label] = root
            label = following
        return root

    # First pass: assign provisional labels and record equivalences
    getpixel = im.getpixel
    for x in range(width):
        column = labels[x]
        previous_column = labels[x - 1] if x > 0 else None
        for y in range(height):
            if getpixel((x, y)) != 0:
                # Not black
                continue

            beside = previous_column[y] if previous_column else 0
            above = column[y - 1] if y > 0 else 0

            if beside and above:
                root_beside = find(beside)
                root_above = find(above)
                # The smaller label always becomes the representative
                smallest = min(root_beside, root_above)
                parent[max(root_beside, root_above)] = smallest
                column[y] = smallest
            elif beside or above:
                column[y] = beside or above
            else:
                parent.append(len(parent))
                column[y] = len(parent) - 1

    # Second pass: collect the pixels of each equivalence class
    regions = {}
    for x in range(width):
        column = labels[x]
        for y in range(height):
            label = column[y]
            if not label:
                continue
            root = find(label)
            if root in regions:
                regions[root].add(x, y)
            else:
                regions[root] = self.__Region(x, y)

    return list(regions.values())

4486 4487 # -------------------------------------------------------------------------

def __getOrientation(self, markers):
    """
        Returns the orientation of the sheet in degrees

        @param markers: list of marker positions as (x, y) tuples,
                        of which the items at index 0 and 2 are used

        @return: the negated arctangent of dx/dy between the two
                 markers, converted to degrees (float); -90.0 if
                 dy is zero
    """

    (first_x, first_y), (second_x, second_y) = markers[0], markers[2]

    delta_x = (second_x - first_x) * 1.0
    delta_y = (second_y - first_y) * 1.0

    try:
        slope = delta_x / delta_y
    except ZeroDivisionError:
        # Both markers on the same y => stand-in for an infinite slope
        slope = 999999999999999999999999999

    angle = math.atan(slope) * (180.0 / math.pi)
    return angle * (-1)

4500 4501 # -------------------------------------------------------------------------

def __scaleFactor(self, markers):
    """
        Returns the scale factors lengthwise and breadthwise

        @param markers: list of marker positions as (x, y) tuples,
                        of which the items at index 0, 2 and 6 are used

        @return: dict with
                 - "x": distance between markers 6 and 2 divided by
                        the reference width (596 - 60)
                 - "y": distance between markers 0 and 2 divided by
                        the reference height (842 - 60)
    """

    # NOTE(review): 596x842 looks like A4 in points and 60 like the
    # combined margins - confirm against the form generator
    reference_width = 596 - 60
    reference_height = 842 - 60

    lengthwise = self.__distance([markers[0], markers[2]])
    breadthwise = self.__distance([markers[6], markers[2]])

    factors = {}
    factors["x"] = breadthwise / reference_width
    factors["y"] = lengthwise / reference_height
    return factors

4516 4517 # -------------------------------------------------------------------------

def __distance(self, li):
    """
        Returns the euclidean distance between two points

        @param li: the points, in the form [(x1, y1), (x2, y2)];
                   the coordinates are converted to int before use

        @return: the distance as float
    """

    start, end = li[0], li[1]

    delta_x = math.fsum((int(end[0]), -int(start[0])))
    delta_y = math.fsum((int(end[1]), -int(start[1])))

    squares = (math.pow(delta_x, 2), math.pow(delta_y, 2))
    return math.sqrt(math.fsum(squares))

4524 4525 # -------------------------------------------------------------------------

def __getMarkers(self, image):
    """
        Gets the markers on the OCR image

        @param image: the image, passed on to __findRegions

        @return: list of the centroids of up to 7 marker regions
                 (area > 320, aspect ratio between 0.67 and 1.5):
                 the centroids are sorted by (x, y), then the first
                 three and the last three of them are each re-ordered
                 by their y coordinate
    """

    centroids = []
    for region in self.__findRegions(image):
        if region.area <= 320:
            continue
        ratio = region.aspectratio()
        if 0.67 < ratio < 1.5:
            centroids.append(region.centroid())

    # This is the list of all the markers on the form.
    centroids.sort()

    by_y = lambda point: point[1]

    markers = sorted(centroids[0:3], key=by_y)
    markers += centroids[3:4]
    markers += sorted(centroids[4:7], key=by_y)
    return markers

4553 4554 # =========================================================================

class __Region(object):
    """
        A connected group of pixels, tracked by its pixel list,
        its pixel count (area) and its bounding box
    """

    def __init__(self, x, y):
        """
            Initialize the region with its first pixel

            @param x: x coordinate of the pixel
            @param y: y coordinate of the pixel
        """
        self._pixels = [(x, y)]
        self._min_x = x
        self._max_x = x
        self._min_y = y
        self._max_y = y
        self.area = 1

    # ---------------------------------------------------------------------
    def add(self, x, y):
        """
            Add a pixel to the region, extending the bounding box

            @param x: x coordinate of the pixel
            @param y: y coordinate of the pixel
        """
        self._pixels.append((x, y))
        self.area += 1
        self._min_x = min(self._min_x, x)
        self._max_x = max(self._max_x, x)
        self._min_y = min(self._min_y, y)
        self._max_y = max(self._max_y, y)

    # ---------------------------------------------------------------------
    def centroid(self):
        """ Returns the centroid of the bounding box as tuple (x, y) """
        return ((self._min_x + self._max_x) / 2,
                (self._min_y + self._max_y) / 2)

    # ---------------------------------------------------------------------
    def box(self):
        """
            Returns the bounding box of the region as list
            [(min_x, min_y), (max_x, max_y)]
        """
        return [(self._min_x, self._min_y), (self._max_x, self._max_y)]

    # ---------------------------------------------------------------------
    def aspectratio(self):
        """
            Calculating the aspect ratio of the region

            @return: extent of the bounding box in x direction divided
                     by its extent in y direction (float); infinity if
                     the extent in y direction is zero, e.g. for a
                     horizontal line of one pixel height
        """
        width = self._max_x - self._min_x
        length = self._max_y - self._min_y
        if length == 0:
            # Avoid ZeroDivisionError for regions within a single row
            return float("inf")
        return float(width) / float(length)

4595 4596 # END ========================================================================= 4597

Source Code for Module s3.s3pdf