1
2
3 """ Resource PDF Tools
4
5 @see: U{B{I{S3XRC}} <http://eden.sahanafoundation.org/wiki/S3XRC>}
6
7 @requires: U{B{I{ReportLab}} <http://www.reportlab.com>}
8
9 ######################################################################
10 DEPRECATION WARNING
11
12 This class is being replaced by the S3RL_PDF codec
13
14 Initially the reporting features will be replaced, with the OCR
15 process being removed at a later stage.
16 ######################################################################
17
18 @copyright: 2011-2019 (c) Sahana Software Foundation
19 @license: MIT
20
21 Permission is hereby granted, free of charge, to any person
22 obtaining a copy of this software and associated documentation
23 files (the "Software"), to deal in the Software without
24 restriction, including without limitation the rights to use,
25 copy, modify, merge, publish, distribute, sublicense, and/or sell
26 copies of the Software, and to permit persons to whom the
27 Software is furnished to do so, subject to the following
28 conditions:
29
30 The above copyright notice and this permission notice shall be
31 included in all copies or substantial portions of the Software.
32
33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
34 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
35 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
36 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
37 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
38 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
39 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
40 OTHER DEALINGS IN THE SOFTWARE.
41 """
42
43 __all__ = ("S3PDF",)
44
45 import json
46 import math
47 import os
48 import re
49 import sys
50 import subprocess
51 import unicodedata
52
53 from copy import deepcopy
54 try:
55 from cStringIO import StringIO
56 except:
57 from StringIO import StringIO
58 from datetime import datetime, timedelta, date
59
60
61
62 from htmlentitydefs import name2codepoint
63
64 from gluon import *
65 from gluon.storage import Storage
66 from gluon.contenttype import contenttype
67 from gluon.languages import lazyT
68
69 try:
70 from lxml import etree
71 except ImportError:
72 sys.stderr.write("ERROR: lxml module needed for XML handling\n")
73 raise
74
75 from s3datetime import S3DateTime
76 from s3rest import S3Method
77 from s3utils import s3_represent_value, s3_validate
78 import s3codec
79
80 try:
81 from PIL import Image
82 from PIL import ImageOps
83 from PIL import ImageStat
84 PILImported = True
85 except(ImportError):
86 try:
87 import Image
88 import ImageOps
89 import ImageStat
90 PILImported = True
91 except(ImportError):
92 sys.stderr.write("S3 Debug: S3PDF: Python Image Library not installed\n")
93 PILImported = False
94 try:
95 from reportlab.lib.enums import TA_CENTER, TA_RIGHT
96 from reportlab.pdfbase import pdfmetrics
97
98 from reportlab.pdfgen import canvas
99 from reportlab.lib.fonts import tt2ps
100 from reportlab.rl_config import canvas_basefontname as _baseFontName
101 from reportlab.platypus import BaseDocTemplate, SimpleDocTemplate, PageTemplate
102 from reportlab.platypus.frames import Frame
103 from reportlab.platypus import Spacer, PageBreak, Paragraph
104 from reportlab.platypus import Table, TableStyle
105 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
106 from reportlab.lib.units import inch
107 from reportlab.lib.units import cm
108 from reportlab.lib import colors
109 from reportlab.lib.colors import Color
110 from reportlab.lib.pagesizes import A4, LETTER, landscape, portrait
111 from reportlab.platypus.flowables import Flowable
112 reportLabImported = True
113 except ImportError:
114 sys.stderr.write("S3 Debug: S3PDF: Reportlab not installed\n")
115 reportLabImported = False
116
117
# Upper bound on the number of options a form field may have. The OCR import
# branch of apply_method compares field.options.count against this limit and,
# for fields above it, replaces the scanned text with the option whose label
# has the smallest dameraulevenshtein distance to that text.
118 MAX_FORM_OPTIONS_LIMIT = 12
119
120
# Module-level container for error messages; created empty here.
121 ERROR = Storage()
# NOTE(review): no enclosing `def` line is visible for the statements below
# (the embedded numbering jumps from 121 to 125) and `r` is not bound anywhere
# in the visible lines. Presumably this is the body of checkDependencies(r),
# which apply_method calls right after storing `self.r` - confirm against the
# original file before relying on this listing.
125 T = current.T
# Translated messages for the two optional libraries this module tries to
# import at load time (PIL and ReportLab).
126 ERROR = Storage(
127 PIL_ERROR=T("PIL (Python Image Library) not installed"),
128 REPORTLAB_ERROR=T("ReportLab not installed"),
129 )
130
# reportLabImported / PILImported are set by the try/except import blocks at
# the top of the module; when either is False, r.error is called with code 501
# and the matching message.
131 if not reportLabImported:
132 r.error(501, ERROR.REPORTLAB_ERROR)
133 if not PILImported:
134 r.error(501, ERROR.PIL_ERROR)
135
136
137
138
139
140
141 if reportLabImported:
142
143
# Flowable subclass that carries a document reference and a new title, and
# assigns that title to the document (see the last statement of this block).
144 - class ChangePageTitle(Flowable):
# doc:      object whose `title` attribute will be overwritten
# newTitle: value to assign to doc.title
145 - def __init__(self, doc, newTitle):
146 Flowable.__init__(self)
147 self.doc = doc
148 self.title = newTitle
149
# NOTE(review): the embedded numbering skips 150 here, so a method header
# appears to be missing from this listing. As shown, the assignment below
# would run inside __init__; presumably it belongs to a separate method that
# runs when the flowable is rendered - confirm against the original file.
151 self.doc.title = self.title
152
159
161 self.function(self.canv, self.data)
162
165 """
166 The standard document template for eden reports
167 It allows for the following page templates:
168 1) First Page
169 2) Even Page
170 3) Odd Page
171 4) Landscape Page
172 """
173
174
def setPageTemplates(self,
                     first,
                     firstEnd,
                     even = None,
                     odd = None,
                     landscape = None,
                     ):
    """
        Register the per-page callbacks that build() hands to the
        "first", "even", "odd" and "landscape" page templates

        @param first: callback stored for the first page; also used for
                      every other page kind that is not given (or falsy)
        @param firstEnd: callback stored as the page-end handler
        @param even: callback for even pages (falls back to first)
        @param odd: callback for odd pages (falls back to first)
        @param landscape: callback for landscape pages (falls back to first)
    """

    self.onfirst = first
    self.onfirstEnd = firstEnd
    # Any callback left out (or falsy) reuses the first-page callback
    self.oneven = even or first
    self.onodd = odd or first
    self.onlandscape = landscape or first
    # Landscape pages are off until something sets this flag
    self.needLandscape = False
201
202
204 """
205 Determine which page template to use
206 """
207
208 self._handle_pageBegin()
209 if self.needLandscape:
210 self._handle_nextPageTemplate("landscape")
211 elif self.page %2 == 1:
212 self._handle_nextPageTemplate("odd")
213 else:
214 self._handle_nextPageTemplate("even")
215
216
def build(self, flowables, canvasmaker=canvas.Canvas):
    """
        Register the page templates and build the document

        A single "body" frame spanning the area inside the margins is
        shared by four page templates: "first", "even", "odd" and
        "landscape". The first three use self.onfirstEnd as their
        page-end callback; "landscape" is created without one.

        @param flowables: the flowables to lay out, passed straight
                          through to BaseDocTemplate.build
        @param canvasmaker: canvas factory, passed straight through
                            to BaseDocTemplate.build
    """

    self._calc()
    body_frame = Frame(self.leftMargin,
                       self.bottomMargin,
                       self.width,
                       self.height,
                       id="body",
                       showBoundary=0)

    # Templates that share the common page-end callback
    page_callbacks = (("first", self.onfirst),
                      ("even", self.oneven),
                      ("odd", self.onodd),
                      )
    templates = [PageTemplate(id=template_id,
                              frames=body_frame,
                              onPage=on_page,
                              onPageEnd=self.onfirstEnd,
                              pagesize=self.pagesize)
                 for template_id, on_page in page_callbacks]

    # The landscape template is deliberately built without onPageEnd
    templates.append(PageTemplate(id="landscape",
                                  frames=body_frame,
                                  onPage=self.onlandscape,
                                  pagesize=self.pagesize))

    self.addPageTemplates(templates)
    BaseDocTemplate.build(self, flowables, canvasmaker=canvasmaker)
254
255
256 -class S3PDF(S3Method):
257 """
258 Class to help generate PDF documents.
259
260 A typical implementation would be as follows:
261
262 exporter = s3base.S3PDF()
263 return exporter(xrequest, **attr)
264
265 Currently this class supports two types of reports:
266 A List: Typically called from the icon shown in a search
267 For example inv/warehouse
268 A Header plus List: Typically called from a button on a form
269 For example ???
270
271 Add additional generic forms to the apply_method() function
272 For specialist forms a S3PDF() object will need to be created.
273 See the apply_method() for ideas on how to create a form,
274 but as a minimum the following structure is required:
275
276 pdf = S3PDF()
277 pdf.newDocument(pdf.defaultTitle(resource))
278
279 # Add specific pages here
280
281 return pdf.buildDoc()
282 """
283
284
286 """
287 Apply CRUD methods
288
289 @param r: the S3Request
290 @param attr: dictionary of parameters for the method handler
291 The attributes that it knows about are:
292 * componentname
293 * formname
294 * list_fields
295 * report_groupby
296 * report_hide_comments
297
298 @return: output object to send to the view
299 """
300
301
302 def getParam(key):
303 """
304 nested function to get the parameters passed into apply_method
305
306 @todo find out if this has been done better elsewhere! :(
307
308 This will first try and get the argument from the attr parameter,
309 if it's not here then try self._config()
310 """
311 value = attr.get(key)
312 if value != None:
313 return value
314 return self._config(key)
315
316 T = current.T
317 self.ERROR = ERROR = Storage(
318 NO_RECORDS=T("No records in this resource. Add one more records manually and then retry."),
319 TESSERACT_ERROR=T("%(app)s not installed. Ask the Server Administrator to install on Server.") % dict(app="Tesseract 3.01"),
320 EMPTY_OCR_FORM=T("Selected OCR Form has no pages. Use another revision of create a new revision by downloading a new Form."),
321 INVALID_IMAGE_TYPE=T("Uploaded file(s) are not Image(s). Supported image formats are '.png', '.jpg', '.bmp', '.gif'."),
322 OCR_DISABLED=T("OCR module is disabled. Ask the Server Administrator to enable it."),
323 IMAGE_MAGICK_ERROR=T("%(app)s not installed. Ask the Server Administrator to install on Server.") % dict(app="ImageMagick"),
324 NOT_PDF_FILE=T("Uploaded file is not a PDF file. Provide a Form in valid PDF Format."),
325 INVALID_PDF=T("Uploaded PDF file has more/less number of page(s) than required. Check if you have provided appropriate revision for your Form as well as check the Form contains appropriate number of pages."),
326 NO_UTC_OFFSET=T("No UTC offset found. Please set UTC offset in your 'User Profile' details. Example: UTC+0530"),
327 INVALID_JOBID=T("The provided 'jobuuid' is invalid. The session of Form upload is invalid. You should retry uploading."),
328 INVALID_FORMID=T("The provided 'formuuid' is invalid. You have selected a Form revision which does not exist on this server."),
329 UNRECOVERABLE_ERROR=T("The uploaded Form is unreadable, please do manual data entry."),
330 JOB_COMPLETE=T("This job has already been finished successfully."),
331 )
332
333 self.r = r
334 checkDependencies(r)
335 settings = current.deployment_settings
336 request = current.request
337 response = current.response
338 session = current.session
339 db = current.db
340
341 if DEBUG:
342 content_disposition = "inline"
343 else:
344 content_disposition = "attachment"
345
346 if settings.get_pdf_size() == "Letter":
347 self.paper_size = LETTER
348 else:
349 self.paper_size = A4
350
351 try:
352 self.logo = os.path.join(request.folder,
353 settings.get_pdf_logo())
354 except:
355 self.logo = None
356 self.headerBanner = None
357
358 method = self.method
359
360 callback = getParam("callback")
361 if callback != None:
362 title = getParam("formname")
363 if title == None:
364 title = self.defaultTitle(self.resource)
365 header = getParam("header")
366 if header == None:
367 header = self.pageHeader
368 footer = getParam("footer")
369 if footer == None:
370 footer = self.pageFooter
371 filename = getParam("filename")
372 if filename == None:
373 filename = title
374 self.newDocument(title,
375 header=header,
376 footer=footer,
377 filename = filename)
378 try:
379 id = r.component_id
380 if id == None:
381 id = r.id
382 except:
383 try:
384 id = r.id
385 except:
386 id = None
387
388 callback(self, id=id)
389
390 doc = self.buildDoc()
391
392 if response:
393 response.headers["Content-Type"] = contenttype(".pdf")
394 response.headers["Content-disposition"] = \
395 "%s; filename=\"%s\"" % (content_disposition,
396 self.filename)
397
398
399 return doc
400
401 elif r.http == "GET":
402 if self.method in ("read", "list"):
403
404
405 componentname = getParam("componentname")
406 title = getParam("formname")
407 list_fields = getParam("list_fields")
408 report_groupby = getParam("report_groupby")
409 report_hide_comments = getParam("report_hide_comments")
410 filename = getParam("filename")
411 if filename == None:
412 filename = title
413
414
415 if title == None:
416 title = self.defaultTitle(self.resource)
417 self.newDocument(title,
418 header=self.pageHeader,
419 footer=self.pageFooter,
420 filename = filename)
421
422 if "report_landscape" in attr:
423 self.setLandscape()
424
425 if "rheader" in attr and attr["rheader"]:
426 self.extractrHeader(attr["rheader"])
427 self.addSpacer(3)
428 elif componentname:
429 self.addrHeader(self.resource,
430 list_fields,
431 report_hide_comments=report_hide_comments)
432 self.addSpacer(3)
433
434 if componentname == None:
435
436 self.addTable(self.resource,
437 list_fields=list_fields,
438 report_groupby=report_groupby,
439 report_hide_comments=report_hide_comments)
440 else:
441
442
443 ptable = self.resource.table
444 ctable = db[componentname]
445 raw_data = []
446 linkfield = None
447 for link in ptable._referenced_by:
448 if link[0] == componentname:
449 linkfield = link[1]
450 break
451 if linkfield != None:
452 query = ctable[linkfield] == self.record_id
453 records = db(query).select()
454 find_fields = []
455 for component in self.resource.components.values():
456 find_fields += component.readable_fields()
457 fields = []
458 if list_fields:
459 for lf in list_fields:
460 for field in find_fields:
461 if field.name == lf:
462 fields.append(field)
463 break
464 else:
465 for field in find_fields:
466 if field.type == "id":
467 continue
468 if report_hide_comments and field.name == "comments":
469 continue
470 fields.append(field)
471 if not fields:
472 fields = [table.id]
473 label_fields = [f.label for f in fields]
474
475 for record in records:
476 data = []
477 for field in fields:
478 value = record[field.name]
479 text = s3_represent_value(field,
480 value=value,
481 strip_markup=True,
482 non_xml_output=True,
483 extended_comments=True
484 )
485 data.append(text)
486 raw_data.append(data)
487 self.addTable(raw_data = raw_data,
488 list_fields=label_fields)
489
490 if "report_footer" in attr:
491 self.addSpacer(3)
492 self.extractrHeader(attr["report_footer"])
493
494 doc = self.buildDoc()
495
496
497 if response:
498 response.headers["Content-Type"] = contenttype(".pdf")
499 response.headers["Content-disposition"] = \
500 "%s; filename=\"%s\"" % (content_disposition,
501 self.filename)
502
503
504 return doc
505
506 elif method == "create":
507 if current.deployment_settings.has_module("ocr"):
508
509 import uuid
510 formUUID = uuid.uuid1()
511 self.newOCRForm(formUUID)
512
513
514 self.OCRPDFManager()
515
516
517 doc = self.buildDoc()
518 numPages = self.doc.numPages
519 layoutXML = self.__getOCRLayout()
520 self.__update_dbmeta(formUUID, layoutXML, numPages)
521
522
523 if response:
524 response.headers["Content-Type"] = contenttype(".pdf")
525 response.headers["Content-disposition"] = \
526 "%s; filename=\"%s\"" % (content_disposition,
527 self.filename)
528
529
530 return doc
531
532 else:
533
534 r.error(501, self.ERROR.OCR_DISABLED)
535
536 elif method == "import":
537
538 if not current.deployment_settings.has_module("ocr"):
539 r.error(501, self.ERROR.OCR_DISABLED)
540
541 authorised = self._permitted(method="create")
542 if not authorised:
543 r.unauthorised()
544
545 try:
546 if r.component:
547 trigger = r.args[3]
548 else:
549 trigger = r.args[1]
550 except(IndexError):
551 trigger = None
552
553 if trigger == "review":
554 try:
555 jobuuid = r.vars["jobuuid"]
556 except(KeyError):
557 r.error(501, current.ERROR.BAD_REQUEST)
558
559
560 current.s3db.table("ocr_meta")
561 statustable = db.ocr_form_status
562 query = (statustable.job_uuid == jobuuid)
563 row = db(query).select(statustable.review_status,
564 statustable.job_has_errors,
565 statustable.image_set_uuid,
566 statustable.form_uuid,
567 limitby=(0, 1)).first()
568 if not row:
569
570 r.error(501, self.ERROR.INVALID_JOBID)
571
572 if row.review_status == 1:
573
574 r.error(501, self.ERROR.JOB_COMPLETE)
575
576
577 if row.job_has_errors == 1:
578 job_has_errors = True
579 else:
580 job_has_errors = False
581
582 self.setuuid = row.image_set_uuid
583
584
585 formuuid = row.form_uuid
586 metatable = db.ocr_meta
587 row = db(metatable.form_uuid == formuuid).select(metatable.s3ocrxml_file,
588 limitby=(0, 1)).first()
589 if not row:
590 r.error(501, self.ERROR.INVALID_FORMID)
591
592 s3ocrxml_filename = row.s3ocrxml_file
593 f = open(os.path.join(r.folder,
594 "uploads",
595 "ocr_meta",
596 s3ocrxml_filename),
597 "rb")
598 s3ocrxml = f.read()
599 f.close()
600
601 s3ocrdict = self.__s3ocrxml2dict(s3ocrxml)
602
603
604 import_job = self.resource.import_tree(None, None,
605 job_id=jobuuid,
606 commit_job=False,
607 ignore_errors=True)
608
609 s3import_enabled = True
610 if s3import_enabled:
611 s3ocrdata = self.__importjob2data(import_job)
612
613 else:
614
615 table = db.ocr_data_xml
616 query = (table.image_set_uuid == self.setuuid)
617 row = db(query).select(table.data_file,
618 limitby=(0, 1)).first()
619
620 if not row:
621 r.error(501, current.ERROR.BAD_RECORD)
622
623 s3ocrdataxml_filename = row.data_file
624 f = open(os.path.join(r.folder,
625 "uploads",
626 "ocr_payload",
627 s3ocrdataxml_filename),
628 "rb")
629 s3ocrdataxml = f.read()
630 f.close()
631
632 s3ocrdata = self.__temp_ocrdataxml_parser(s3ocrdataxml)
633
634 reviewform = self.__create_review_form(s3ocrdict, s3ocrdata)
635
636 return response.render("_ocr_review.html",
637 dict(reviewform=reviewform)
638 )
639
640 elif trigger == "image":
641
642 try:
643 setuuid = r.vars["setuuid"]
644 resource_table = r.vars["resource_table"]
645 field_name = r.vars["field_name"]
646 except(KeyError):
647 r.error(501, current.ERROR.BAD_REQUEST)
648
649 try:
650 value = r.vars["value"]
651 except(KeyError):
652 value = None
653 try:
654 sequence = r.vars["sequence"]
655 except(KeyError):
656 r.error(501, current.ERROR.BAD_REQUEST)
657
658
659 current.s3db.table("ocr_meta")
660 table = db.ocr_field_crops
661 if value:
662 query = (table.image_set_uuid == setuuid) & \
663 (table.resource_table == resource_table) & \
664 (table.field_name == field_name) & \
665 (table.value == value)
666 row = db(query).select(table.image_file,
667 limitby=(0, 1)).first()
668 else:
669 query = (table.image_set_uuid == setuuid) & \
670 (table.resource_table == resource_table) & \
671 (table.field_name == field_name) & \
672 (table.sequence == sequence)
673 row = db(query).select(table.image_file,
674 limitby=(0, 1)).first()
675 if not row:
676 r.error(501, current.ERROR.BAD_RECORD)
677
678 format = row.image_file[-4:]
679 image_file = open(os.path.join(r.folder,
680 "uploads",
681 "ocr_payload",
682 row.image_file))
683 image_file_content = image_file.read()
684 image_file.close()
685
686 if response:
687 response.headers["Content-Type"] = contenttype(format)
688 response.headers["Content-disposition"] = \
689 "%s; filename=\"%s\"" % ("inline",
690 "tempimage%s" % format)
691
692
693 return image_file_content
694
695 elif trigger == "import":
696
697 try:
698 setuuid = r.vars["setuuid"]
699 except(KeyError):
700 r.error(501, current.ERROR.BAD_REQUEST)
701
702
703 statustable = s3db.ocr_form_status
704 query = (statustable.image_set_uuid == setuuid)
705 row = db(query).select(statustable.job_uuid,
706 limitby=(0, 1)).first()
707 if row:
708
709 jobuuid = row.job_uuid
710
711 if r.component:
712
713 request_args = request.get("args", ["", ""])
714 record_id = request_args[0]
715 component_name = request_args[1]
716 urlprefix = "%s/%s/%s" % (request.function,
717 record_id,
718 component_name)
719 else:
720
721 urlprefix = request.function
722
723 redirect(URL(request.controller,
724 "%s/upload.pdf" % urlprefix,
725 args="review",
726 vars={"jobuuid":jobuuid}))
727
728 table = db.ocr_data_xml
729 row = db(table.image_set_uuid == setuuid).select(table.data_file,
730 table.form_uuid,
731 limitby=(0, 1)
732 ).first()
733 if not row:
734 r.error(501, current.ERROR.BAD_RECORD)
735
736 data_file = open(os.path.join(r.folder,
737 "uploads",
738 "ocr_payload",
739 row.data_file))
740 formuuid = row.form_uuid
741
742 datafile_content = data_file.read()
743 data_file.close()
744
745 metatable = db.ocr_meta
746 row = db(metatable.form_uuid == formuuid).select(metatable.s3ocrxml_file,
747 limitby=(0, 1)
748 ).first()
749 if not row:
750 r.error(501, self.ERROR.INVALID_FORMID)
751
752 s3ocrxml_filename = row.s3ocrxml_file
753 f = open(os.path.join(r.folder,
754 "uploads",
755 "ocr_meta",
756 s3ocrxml_filename),
757 "rb")
758 s3ocrxml = f.read()
759 f.close()
760
761 s3ocrdict = self.__s3ocrxml2dict(s3ocrxml)
762 crosslimit_options = {}
763 for resourcename in s3ocrdict["$resource_seq"]:
764 resource = s3ocrdict[resourcename]
765 for fieldname in resource["$field_seq"]:
766 field = resource[fieldname]
767 if field.has_options:
768 if field.options and \
769 field.options.count > MAX_FORM_OPTIONS_LIMIT:
770 if not crosslimit_options.has_key(resourcename):
771 crosslimit_options[resourcename] = [fieldname]
772 else:
773 crosslimit_options[resourcename].append(fieldname)
774
775 if len(crosslimit_options) != 0:
776 s3xml_root = etree.fromstring(datafile_content)
777 resource_element = s3xml_root.getchildren()[0]
778 resourcename = resource_element.attrib.get("name")
779 for field in resource_element:
780 if field.tag == "data":
781 if crosslimit_options.has_key(resourcename):
782 fieldname = field.attrib.get("field")
783 if fieldname in crosslimit_options[resourcename]:
784 match_status = {}
785 value = field.text.encode("utf-8").lower()
786 for option in s3ocrdict[resourcename][fieldname].options.list:
787 try:
788 fieldtext = option.label.lower()
789 except:
790 fieldtext = ""
791 match_status[option.value] =\
792 self.dameraulevenshtein(cast2ascii(fieldtext),
793 cast2ascii(value))
794
795 closematch_value = 1000000000
796 closematch = []
797
798 for match in match_status.keys():
799 if match_status[match] < closematch_value:
800 closematch = [match]
801 closematch_value = match_status[match]
802 elif match_status[match] == closematch_value:
803 closematch.append(match)
804
805 if len(closematch) > 0:
806 value = closematch[0]
807 else:
808 value = ""
809
810 field.text = value
811 field.attrib["value"] = value
812
813
814 elif field.tag == "resource":
815 resourcename = field.attrib.get("name")
816 for subfield in field:
817 if subfield.tag == "data":
818 fieldname = subfield.attrib.get("field")
819 if resourcename in crosslimit_options.keys() and\
820 fieldname in crosslimit_options[resourcename]:
821 match_status = {}
822 value = subfield.text.encode("utf-8").lower()
823 for option in s3ocrdict[resourcename][fieldname].options.list:
824 try:
825 fieldtext = option.label.lower()
826 except:
827 fieldtext = ""
828 match_status[option.value] =\
829 self.dameraulevenshtein(cast2ascii(fieldtext),
830 cast2ascii(value))
831
832 closematch_value = 1000000000
833 closematch = []
834
835 for match in match_status.keys():
836 if match_status[match] < closematch_value:
837 closematch = [match]
838 closematch_value = match_status[match]
839 elif match_status[match] == closematch_value:
840 closematch.append(match)
841
842 if len(closematch) > 0:
843 value = closematch[0]
844 else:
845 value = ""
846
847 subfield.text = value
848 subfield.attrib["value"] = value
849
850 datafile_content = etree.tostring(s3xml_root)
851
852
853 outputjson = self.resource.import_xml(StringIO(datafile_content),
854 commit_job=False,
855 ignore_errors=True)
856
857
858 jobuuid = self.resource.job.job_id
859 json2dict = json.loads(outputjson, strict=False)
860
861 if json2dict.has_key("message"):
862 jobhaserrors = 1
863 else:
864 jobhaserrors = 0
865
866
867 if json2dict.get("statuscode") != "200":
868 r.error(501, self.ERROR.UNRECOVERABLE_ERROR)
869
870
871 db.ocr_form_status.insert(image_set_uuid=setuuid,
872 form_uuid=formuuid,
873 job_uuid=jobuuid,
874 job_has_errors=jobhaserrors)
875
876 if r.component:
877 request_args = request.get("args", ["", ""])
878 record_id = request_args[0]
879 component_name = request_args[1]
880 urlprefix = "%s/%s/%s" % (request.function,
881 record_id,
882 component_name)
883
884 else:
885
886 urlprefix = request.function
887
888 redirect(URL(request.controller,
889 "%s/upload.pdf" % urlprefix,
890 args="review",
891 vars={"jobuuid":jobuuid}))
892
893 else:
894
895
896
897 auth = current.auth
898 if auth.user:
899 utc_offset = auth.user.utc_offset
900 else:
901 r.error(501, self.ERROR.NO_UTC_OFFSET)
902
903
904 current.s3db.ocr_meta
905
906
907 formuuid = r.vars.get("formuuid", None)
908 uploadformat = r.vars.get("uploadformat", None)
909 requesturl = request.env.path_info
910 createurl = "%s/create.pdf" %\
911 requesturl[0:requesturl.rfind("/")]
912 if not (formuuid and uploadformat):
913 availForms = self.__getResourceForms()
914 return response.render("_ocr_upload.html",
915 dict(availForms=availForms,
916 createurl=createurl))
917 else:
918 try:
919 numpages = self.__getNumPages(formuuid)
920 except:
921 r.error(501, current.ERROR.BAD_RECORD)
922
923 if not numpages:
924 r.error(501, self.ERROR.EMPTY_OCR_FORM)
925
926 return response.render("_ocr_page_upload.html",
927 dict(numpages=numpages,
928 posturl=createurl,
929 formuuid=formuuid,
930 uploadformat=uploadformat))
931
932 numpages = self.__getNumPages(formuuid)
933 if not numpages:
934 r.error(501, self.ERROR.EMPTY_OCR_FORM)
935
936 return response.render("_ocr_page_upload.html",
937 dict(numpages=numpages,
938 posturl=createurl,
939 formuuid=formuuid,
940 uploadformat=uploadformat))
941
942 else:
943 r.error(405, current.ERROR.BAD_METHOD)
944
945 elif r.http == "POST":
946 if method == "create":
947
948 if not current.deployment_settings.has_module("ocr"):
949 r.error(501, self.ERROR.OCR_DISABLED)
950
951
952 formuuid = r.vars.formuuid
953 numpages = int(r.vars.numpages)
954 uploadformat = r.vars.uploadformat
955
956
957 import uuid
958 setuuid = uuid.uuid1()
959
960
961 current.s3db.ocr_meta
962
963
964 if uploadformat == "image":
965
966 payloadtable = db.ocr_payload
967 for eachpage in xrange(1, numpages + 1):
968 varname = "page%s" % eachpage
969 fileholder = r.vars[varname]
970 pagenumber = eachpage
971
972
973 imgfilename = fileholder.filename
974 extension = lambda m: m[m.rfind(".") + 1:]
975 imageformats = ["jpg", "png", "gif", "bmp"]
976
977 if extension(imgfilename) not in imageformats:
978 r.error(501, self.ERROR.INVALID_IMAGE_TYPE)
979
980
981 payloadtable.insert(
982 image_set_uuid=setuuid,
983 image_file=payloadtable["image_file"].store(\
984 fileholder.file,
985 fileholder.filename),
986 page_number=pagenumber)
987
988 elif uploadformat == "pdf":
989 fileholder = r.vars["pdffile"]
990
991 filename = fileholder.filename
992 extension = lambda m: m[m.rfind(".") + 1:]
993
994 if extension(filename) != "pdf":
995 r.error(501, self.ERROR.NOT_PDF_FILE)
996
997
998 uniqueuuid = setuuid
999 inputfilename = "%s_%s" % (uniqueuuid, fileholder.filename)
1000 outputfilename = "%s_%s.png" % (uniqueuuid,
1001 fileholder.filename[:-4])
1002
1003 ocr_temp_dir = os.path.join(self.r.folder,
1004 "uploads", "ocr_temp")
1005 try:
1006 os.mkdir(ocr_temp_dir)
1007 except(OSError):
1008 pass
1009
1010 f = open(os.path.join(ocr_temp_dir, inputfilename), "w")
1011 f.write(fileholder.file.read())
1012 f.close()
1013
1014 success = subprocess.call(["convert",
1015 os.path.join(ocr_temp_dir,
1016 inputfilename),
1017 os.path.join(ocr_temp_dir,
1018 outputfilename)])
1019 if success != 0:
1020 self.r.error(501, self.ERROR.IMAGE_MAGICK_ERROR)
1021
1022
1023 payloadtable = db.ocr_payload
1024
1025 if numpages == 1:
1026 imagefilename = outputfilename
1027 imgfilepath = os.path.join(ocr_temp_dir, imagefilename)
1028 try:
1029 imgfile = open(imgfilepath)
1030 except(IOError):
1031 self.r.error(501, self.ERROR.INVALID_PDF)
1032 pagenumber = 1
1033
1034
1035 payloadtable.insert(
1036 image_set_uuid=setuuid,
1037 image_file=payloadtable["image_file"].store(\
1038 imgfile,
1039 imagefilename),
1040 page_number=pagenumber)
1041 imgfile.close()
1042 os.remove(imgfilepath)
1043
1044 else:
1045 for eachpage in xrange(0, numpages):
1046 imagefilename = "%s-%s.png" % (outputfilename[:-4],
1047 eachpage)
1048 imgfilepath = os.path.join(ocr_temp_dir,
1049 imagefilename)
1050 try:
1051 imgfile = open(imgfilepath, "r")
1052 except(IOError):
1053 self.r.error(501, self.ERROR.INVALID_PDF)
1054
1055 pagenumber = eachpage + 1
1056
1057
1058 payloadtable.insert(
1059 image_set_uuid=setuuid,
1060 image_file=payloadtable["image_file"].store(\
1061 imgfile,
1062 imagefilename),
1063 page_number=pagenumber)
1064 imgfile.close()
1065 os.remove(imgfilepath)
1066
1067 os.remove(os.path.join(ocr_temp_dir, inputfilename))
1068 try:
1069 os.rmdir(ocr_temp_dir)
1070 except(OSError):
1071 import shutil
1072 shutil.rmtree(ocr_temp_dir)
1073
1074 else:
1075 r.error(501, self.ERROR.INVALID_IMAGE_TYPE)
1076
1077
1078 s3ocrimageparser = S3OCRImageParser(self, r)
1079 output = s3ocrimageparser.parse(formuuid, setuuid)
1080
1081 table = db.ocr_data_xml
1082 table.insert(image_set_uuid=setuuid,
1083 data_file=table["data_file"].store(
1084 StringIO(output),
1085 "%s-data.xml" % setuuid),
1086 form_uuid=formuuid,
1087 )
1088
1089 if r.component:
1090 request_args = current.request.get("args", ["", ""])
1091 record_id = request_args[0]
1092 component_name = request_args[1]
1093 urlprefix = "%s/%s/%s" % (request.function,
1094 record_id,
1095 component_name)
1096
1097 else:
1098
1099 urlprefix = request.function
1100
1101 redirect(URL(request.controller,
1102 "%s/import.pdf" % urlprefix,
1103 args="import",
1104 vars={"setuuid":setuuid}))
1105
1106 elif method == "import":
1107 if not current.deployment_settings.has_module("ocr"):
1108 r.error(501, self.ERROR.OCR_DISABLED)
1109
1110 authorised = self._permitted(method="create")
1111 if not authorised:
1112 r.unauthorised()
1113
1114 try:
1115 if r.component:
1116 trigger = r.args[3]
1117 else:
1118 trigger = r.args[1]
1119 except(IndexError):
1120 trigger = None
1121
1122 if trigger == "review":
1123
1124 jobuuid = r.vars.pop("jobuuid")
1125
1126
1127 statustable = current.s3db.ocr_form_status
1128 query = (statustable.job_uuid == jobuuid)
1129 row = db(query).select(statustable.review_status,
1130 limitby=(0, 1)).first()
1131 if not row:
1132 r.error(501, self.ERROR.INVALID_JOBID)
1133
1134 if row.review_status == 1:
1135
1136 r.error(501, self.ERROR.JOB_COMPLETE)
1137
1138 try:
1139 r.vars.pop("_utc_offset")
1140 except:
1141 pass
1142
1143 try:
1144 ignore_fields = r.vars.pop("ignore-fields-list")
1145 except:
1146 ignore_fields = None
1147
1148 if not ignore_fields:
1149 ignore_fields = []
1150 else:
1151 try:
1152 ignore_fields = ignore_fields.split("|")
1153 except:
1154 ignore_fields = [ignore_fields]
1155
1156 datadict = Storage()
1157 for field in r.vars.keys():
1158 resourcetable, fieldname = field.split("-")
1159 if not datadict.has_key(resourcetable):
1160 datadict[resourcetable] = Storage()
1161
1162 datadict[resourcetable][fieldname] = r.vars[field]
1163
1164 for field in ignore_fields:
1165 resourcetable, fieldname = field.split("-")
1166 datadict[resourcetable].pop(fieldname)
1167 if len(datadict[resourcetable]) == 0:
1168 datadict.pop(resourcetable)
1169
1170 s3xml_etree_dict = Storage()
1171 for resource in datadict.keys():
1172 s3xml_root = etree.Element("s3xml")
1173 resource_element = etree.SubElement(s3xml_root, "resource")
1174 resource_element.attrib["name"] = resource
1175
1176 for field in datadict[resource].keys():
1177 fieldvalue = datadict[resource][field]
1178 fieldvalue = str(fieldvalue) if fieldvalue else ""
1179 fieldtype = db[resource][field].type
1180 if fieldtype.startswith("reference "):
1181 reference_resource_name = fieldtype[len("reference "):]
1182
1183 reference_element =\
1184 etree.SubElement(resource_element, "reference")
1185 reference_element.attrib["field"] = field
1186 reference_element.attrib["resource"] = reference_resource_name
1187
1188 ref_res_element =\
1189 etree.SubElement(reference_element, "resource")
1190 ref_res_element.attrib["name"] = reference_resource_name
1191
1192 ref_res_data_element =\
1193 etree.SubElement(ref_res_element, "data")
1194 ref_res_data_element.attrib["field"] = "name"
1195 try:
1196 ref_res_data_element.text = cast2ascii(fieldvalue)
1197 except(ValueError):
1198 ref_res_data_element.text = ""
1199 else:
1200 field_element = etree.SubElement(resource_element, "data")
1201 field_element.attrib["field"] = field
1202 try:
1203 field_element.attrib["value"] = cast2ascii(fieldvalue)
1204 except(ValueError):
1205 field_element.attrib["value"] = ""
1206 try:
1207 field_element.text = cast2ascii(fieldvalue)
1208 except(ValueError):
1209 field_element.text = ""
1210
1211 s3xml_etree_dict[resource] = s3xml_root
1212
1213 errordict = {}
1214
1215 _record = current.xml.record
1216 s3record_dict = Storage()
1217 for tablename in s3xml_etree_dict.keys():
1218 record = _record(db[tablename],
1219 s3xml_etree_dict[tablename].getchildren()[0])
1220 s3record_dict[tablename] = record
1221
1222 import_job = r.resource.import_tree(None, None, job_id=jobuuid,
1223 ignore_errors=False,
1224 commit_job=False)
1225
1226 response.headers["Content-Type"] = contenttype(".json")
1227
1228 for tablename in s3record_dict.keys():
1229 record = s3record_dict[tablename]
1230 possible_items = []
1231 our_item = None
1232 for eachitem in import_job.items.keys():
1233 item = import_job.items[eachitem]
1234 if item.table == tablename:
1235 if item.data and (len(item.data) > 0):
1236 our_item = item
1237 else:
1238 if item.data and (len(item.data) == 0):
1239 possible_items.append(item)
1240
1241 if our_item:
1242 our_item.update(record)
1243 elif len(possible_items) > 0:
1244 possible_items[0].update(record)
1245 else:
1246 import_job.add_item(s3xml_etree_dict[tablename].getchildren()[0])
1247
1248 for resourcename in datadict.keys():
1249 table = db[resourcename]
1250 for field in datadict[resourcename].keys():
1251 if not table[field].type.startswith("reference "):
1252 value, error = s3_validate(table,
1253 field,
1254 datadict[resourcename][field])
1255 if error:
1256 errordict["%s-%s" % (resourcename, field)] = str(error)
1257
1258 if not import_job.error_tree:
1259 store_success = import_job.store()
1260 if store_success:
1261 if import_job.error_tree:
1262 errordict = self.__parse_job_error_tree(import_job.error_tree)
1263 success = False
1264 else:
1265
1266 for resourcename in datadict.keys():
1267 table = db[resourcename]
1268 for field in datadict[resourcename].keys():
1269 if not table[field].type.startswith("reference "):
1270 value, error =\
1271 s3_validate(table,
1272 field,
1273 datadict[resourcename][field])
1274 if error:
1275 errordict["%s-%s" % (resourcename, field)] = str(error)
1276
1277 if len(errordict) > 0:
1278 success = False
1279 else:
1280 success = True
1281 import_job.commit()
1282
1283 else:
1284 errordict = self.__parse_job_error_tree(import_job.error_tree)
1285 success = False
1286 else:
1287 errordict = self.__parse_job_error_tree(import_job.error_tree)
1288 success = False
1289
1290 if success:
1291 session.confirmation =\
1292 T("OCR review data has been stored into the database successfully.")
1293
1294
1295 statustable = db["ocr_form_status"]
1296 query = (statustable.job_uuid == jobuuid)
1297 row = db(query).select(statustable.image_set_uuid).first()
1298 image_set_uuid = row.image_set_uuid
1299
1300
1301 db(query).update(review_status=1)
1302
1303
1304 cropstable = db.ocr_field_crops
1305 query = (cropstable.image_set_uuid == image_set_uuid)
1306
1307
1308 rows = db(query).select(cropstable.image_file)
1309 for row in rows:
1310 filename = row.image_file
1311 filepath = os.path.join(self.r.folder,
1312 "uploads",
1313 "ocr_payload",
1314 filename)
1315 os.remove(filepath)
1316
1317
1318 db(query).delete()
1319
1320 return json.dumps({"success": success,
1321 "error": errordict})
1322
1323 else:
1324 r.error(405, current.ERROR.BAD_METHOD)
1325
1326 else:
1327 r.error(501, current.ERROR.BAD_REQUEST)
1328
1329
1331 """
1332 create a dictionary of fields with errors
1333
1334 @param tree: S3ImportJob.error_tree
1335 @return: errordict
1336 """
1337
1338 errordict = {}
1339
1340 for resource in tree:
1341 resourcename = resource.attrib.get("name")
1342 for field in resource:
1343 fieldname = field.attrib.get("field")
1344 error = field.attrib.get("error")
1345 if error:
1346 errordict["%s-%s" % (resourcename, fieldname)] = error
1347
1348 return errordict
1349
1350
1352 """
1353 Calculate the Damerau-Levenshtein distance between sequences.
1354
1355 This distance is the number of additions, deletions, substitutions,
1356 and transpositions needed to transform the first sequence into the
1357 second. Although generally used with strings, any sequences of
1358 comparable objects will work.
1359
1360 Transpositions are exchanges of *consecutive* characters; all other
1361 operations are self-explanatory.
1362
1363 This implementation is O(N*M) time and O(M) space, for N and M the
1364 lengths of the two sequences.
1365
1366 >>> dameraulevenshtein('ba', 'abc')
1367 2
1368 >>> dameraulevenshtein('fee', 'deed')
1369 2
1370
1371 It works with arbitrary sequences too:
1372 >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
1373 2
1374 """
1375
1376
1377
1378
1379
1380 oneago = None
1381 thisrow = range(1, len(seq2) + 1) + [0]
1382 for x in xrange(len(seq1)):
1383
1384
1385
1386 twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
1387 for y in xrange(len(seq2)):
1388 delcost = oneago[y] + 1
1389 addcost = thisrow[y - 1] + 1
1390 subcost = oneago[y - 1] + (seq1[x] != seq2[y])
1391 thisrow[y] = min(delcost, addcost, subcost)
1392
1393 if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
1394 and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
1395 thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
1396 return thisrow[len(seq2) - 1]
1397
1398
1400 """
1401 convert data generated from ocr parser to a dictionary
1402
1403 @param s3dataxml: output of S3OCRImageParser
1404
1405 @return: python dictionary equivalent to the input xml
1406 """
1407
1408 s3ocrdataxml_etree = etree.fromstring(s3ocrdataxml)
1409 s3ocrdatadict = Storage()
1410
1411 s3xml_root = s3ocrdataxml_etree
1412 resource_element = s3xml_root.getchildren()[0]
1413 s3ocr_root = etree.Element("s3ocr")
1414
1415 if self.r.component:
1416 s3ocr_root.append(resource_element)
1417
1418 else:
1419 componentetrees = []
1420
1421 mres = etree.Element("resource")
1422 for attr in resource_element.attrib.keys():
1423 mres.set(attr, resource_element.attrib.get(attr))
1424 for field_element in resource_element:
1425 if field_element.tag in ["data", "reference"]:
1426 mres.append(field_element)
1427 elif field_element.tag == "resource":
1428 componentetrees.append(field_element)
1429
1430 serialised_component_etrees = componentetrees
1431
1432
1433 s3ocr_root.append(mres)
1434 for res in serialised_component_etrees:
1435 s3ocr_root.append(res)
1436
1437 for resource in s3ocr_root:
1438 resourcename = resource.attrib.get("name")
1439 s3ocrdatadict[resourcename] = Storage()
1440 for field in resource:
1441 if field.tag == "reference":
1442 fieldname = field.attrib.get("field")
1443 ref_res_field = field.getchildren()[0]
1444 datafield = ref_res_field.getchildren()[0]
1445 value = datafield.text
1446
1447 else:
1448 fieldname = field.attrib.get("field")
1449 value = field.attrib.get("value")
1450 text = field.text
1451 if not value:
1452 value = text
1453
1454 s3ocrdatadict[resourcename][fieldname] = value
1455 return s3ocrdatadict
1456
1457
1459 """
1460 convert data from import job into a dictionary
1461
1462 @param importjob: S3ImportJob instance
1463
1464 @return: data of S3ImportJob into a dictionary
1465 """
1466
1467 s3ocrdata = Storage()
1468
1469 import_item_dict = importjob.items
1470 for eachitem in import_item_dict.keys():
1471 import_item = import_item_dict[eachitem]
1472 if import_item.data and len(import_item.data) > 0:
1473 s3ocrdata[str(import_item.table)] = import_item.data
1474
1475 return s3ocrdata
1476
1477
1853
1854
1856 """
1857 convert s3ocrxml to dictionary so that it can be used in templates
1858
1859 @param s3ocrxml: content of a s3ocrxml file, in text
1860
1861 @return: equivalent dictionary for s3ocrxml file
1862 """
1863
1864 db = current.db
1865 s3ocr_etree = etree.fromstring(s3ocrxml)
1866 s3ocrdict = Storage()
1867 resource_seq = []
1868
1869 for resource in s3ocr_etree:
1870 resourcename = resource.attrib.get("name")
1871 table = db[resourcename]
1872 s3ocrdict[resourcename] = Storage()
1873 resource_seq.append(resourcename)
1874 field_seq = []
1875 for field in resource:
1876 get = field.attrib.get
1877 fieldname = get("name")
1878
1879 if get("readable") == "True" and \
1880 get("writable") == "True":
1881
1882 field_seq.append(fieldname)
1883
1884 fieldlabel = get("label")
1885 fieldtype = get("type")
1886 numlines = get("lines", "1")
1887
1888 if get("reference") == "1":
1889 fieldreference = True
1890 else:
1891 fieldreference = False
1892 fieldresource = get("resource")
1893 if get("has_options") == "True":
1894 fieldhasoptions = True
1895 else:
1896 fieldhasoptions = False
1897
1898
1899 fieldcomment = table[fieldname].comment
1900
1901 if fieldhasoptions:
1902 try:
1903 s3ocrselect = field.getchildren()[0]
1904 options_found = True
1905 except(IndexError):
1906 fieldoptions = None
1907 options_found = False
1908
1909 if options_found:
1910
1911 numoptions = len(s3ocrselect.getchildren())
1912 optionlist = []
1913
1914 for option in s3ocrselect:
1915 optionlabel = option.text
1916 optionvalue = option.attrib.get("value")
1917 optionlist.append(Storage({"label": optionlabel,
1918 "value": optionvalue}))
1919
1920 fieldoptions = Storage({"count": numoptions,
1921 "list": optionlist})
1922
1923 else:
1924 fieldoptions = None
1925 else:
1926 fieldoptions = None
1927
1928 s3ocrdict[resourcename][fieldname] = Storage({"label": fieldlabel,
1929 "type": fieldtype,
1930 "comment": fieldcomment,
1931 "reference": fieldreference,
1932 "resource": fieldresource,
1933 "has_options": fieldhasoptions,
1934 "options": fieldoptions,
1935 "lines": int(numlines)
1936 })
1937 s3ocrdict[resourcename]["$field_seq"] = field_seq
1938
1939 s3ocrdict["$resource_seq"] = resource_seq
1940
1941 return s3ocrdict
1942
1943
def newDocument(self,
                title,
                header,
                footer,
                filename=None,
                heading=None,
                ):
    """
        Create a new, empty PDF document.
        Data then needs to be added to this document.

        @param title: the title of the document; combined with the
                      request timestamp to form the document title, and
                      used for the file name if no filename is given
        @param header: page header, handed to
                       EdenDocTemplate.setPageTemplates()
        @param footer: page footer, handed to
                       EdenDocTemplate.setPageTemplates()
        @param filename: base name for the generated file name,
                         defaults to the title
        @param heading: heading stored as self.title and self.prevtitle,
                        defaults to the title

        Nothing is returned: the document template, the output buffer,
        the (empty) content list and the file name are stored on the
        instance as self.doc, self.output, self.content and self.filename.
        The page is set to portrait with margins of 0.4 inch.
    """

    # Timestamp cut to seconds, e.g. "2019-01-31 12:00:00"
    now = self.request.now.isoformat()[:19].replace("T", " ")
    docTitle = "%s %s" % (title, now)

    if filename is None:
        filename = title
    self.filename = "%s_%s.pdf" % (filename, now)

    self.output = StringIO()
    self.doc = EdenDocTemplate(self.output, title=docTitle)
    self.doc.setPageTemplates(header, footer)
    self.content = []

    if heading is None:
        heading = title
    self.title = heading
    self.prevtitle = heading

    self.setPortrait()

    # Initial margins; setMargins() without arguments copies them
    # onto the document template
    self.leftMargin = 0.4 * inch
    self.rightMargin = 0.4 * inch
    self.topMargin = 0.4 * inch
    self.bottomMargin = 0.4 * inch
    self.MINIMUM_MARGIN_SIZE = 0.3 * inch
    self.setMargins()
1983
1984
2010
2011
2035
2036
def __getNumPages(self, formuuid):
    """
        Look up the number of pages stored in ocr_meta for a form

        @param formuuid: uuid of the form whose page count is wanted

        @return: the page count of the form with that uuid, as int
    """

    ocr_meta = current.db.ocr_meta
    query = (ocr_meta.form_uuid == formuuid)
    # Only the first matching record is read
    record = current.db(query).select(ocr_meta.pages,
                                      limitby=(0, 1)
                                      ).first()
    pages = record.pages
    return int(pages)
2054
2055
2057 """
2058 Optimise and modify the s3xml etree to produce the s3ocr etree
2059
2060 @return: s3ocr etree
2061 """
2062
2063 r = self.r
2064
2065 s3xml_etree = self.resource.export_struct(options=True,
2066 references=True,
2067 stylesheet=None,
2068 as_json=False,
2069 as_tree=True)
2070
2071
2072 ITEXT = "label"
2073 HINT = "comment"
2074 TYPE = "type"
2075 HASOPTIONS = "has_options"
2076 LINES = "lines"
2077 BOXES = "boxes"
2078 REFERENCE = "reference"
2079 RESOURCE = "resource"
2080
2081
2082
2083 s3xml_root = s3xml_etree.getroot()
2084 resource_element = s3xml_root.getchildren()[0]
2085 s3ocr_root = etree.Element("s3ocr")
2086
2087
2088 settings = current.deployment_settings
2089 self.exclude_component_list =\
2090 settings.get_pdf_excluded_fields("%s_%s" % \
2091 (r.prefix,
2092 r.resource.name))
2093
2094 if r.component:
2095 s3ocr_root.append(resource_element)
2096
2097 else:
2098 componentetrees = []
2099
2100 mres = etree.Element("resource")
2101 for attr in resource_element.attrib.keys():
2102 mres.set(attr, resource_element.attrib.get(attr))
2103 for field_element in resource_element:
2104 if field_element.tag == "field":
2105 mres.append(field_element)
2106 elif field_element.tag == "resource":
2107 componentetrees.append(field_element)
2108
2109 serialised_component_etrees = componentetrees
2110
2111
2112 s3ocr_root.append(mres)
2113 for res in serialised_component_etrees:
2114 s3ocr_root.append(res)
2115
2116
2117 self.generic_ocr_field_type = {
2118 "string": "string",
2119 "text": "textbox",
2120 "boolean" : "boolean",
2121 "double": "double",
2122 "date": "date",
2123 "datetime": "datetime",
2124 "integer": "integer",
2125 "list:integer": "multiselect",
2126 "list:string": "multiselect",
2127 "list:double": "multiselect",
2128 "list:text": "multiselect",
2129 }
2130
2131
2132
2133 FIELD_TYPE_LINES = {
2134 "string": 1,
2135 "textbox": 2,
2136 "integer": 1,
2137 "double": 1,
2138 "date": 1,
2139 "datetime": 1,
2140 }
2141 FIELD_TYPE_BOXES = {
2142 "integer": 8,
2143 "double": 16,
2144 }
2145 for resource in s3ocr_root.iterchildren():
2146 rget = resource.attrib.get
2147 resourcetablename = rget("name")
2148
2149
2150 if not r.component:
2151 if rget("name") in self.exclude_component_list:
2152 s3ocr_root.remove(resource)
2153 continue
2154
2155 if "alias" in resource.attrib:
2156 alias = resource.attrib["alias"]
2157 elif "_" in resourcetablename:
2158 alias = resourcetablename.split("_", 1)[1]
2159 else:
2160 alias = resourcetablename
2161
2162 if alias == self.resource.alias and \
2163 resourcetablename == self.resource.tablename:
2164 fieldresource = self.resource
2165 elif alias in self.resource.components:
2166 fieldresource = self.resource.components[alias]
2167 else:
2168 continue
2169
2170 for field in resource.iterchildren():
2171 get = field.attrib.get
2172 set = field.set
2173 fieldname = get("name")
2174
2175 fieldtype = get(TYPE)
2176
2177 if fieldtype.startswith("reference "):
2178 set(RESOURCE, fieldtype.split("reference ")[1])
2179 set(REFERENCE, "1")
2180 else:
2181 set(REFERENCE, "0")
2182
2183
2184 ocrfieldtype = self.generic_ocr_field_type.get(fieldtype, None)
2185 if ocrfieldtype != None:
2186 set(TYPE, ocrfieldtype)
2187
2188 fieldtype = get(TYPE)
2189
2190
2191 fieldhasoptions = get(HASOPTIONS)
2192 if fieldhasoptions == "False":
2193 set(LINES, str(FIELD_TYPE_LINES.get(fieldtype, 1)))
2194 if fieldtype in FIELD_TYPE_BOXES.keys():
2195 set(BOXES, str(FIELD_TYPE_BOXES.get(fieldtype)))
2196
2197
2198 if get("readable", "False") == "True" and \
2199 get("writable", "False") == "False":
2200
2201 fieldname = get("name")
2202 try:
2203 fielddefault = fieldresource.table[fieldname].default
2204 except(KeyError):
2205 fielddefault = "None"
2206 set("default", str(fielddefault))
2207
2208
2209 if fieldtype not in self.generic_ocr_field_type.values():
2210 set(TYPE, "string")
2211 set(HASOPTIONS, "False")
2212 set(LINES, "2")
2213
2214 fieldtype = get(TYPE)
2215
2216
2217 if fieldtype == "boolean":
2218 set(HASOPTIONS, "True")
2219
2220
2221 if get("readable", "False") == "False" and \
2222 get("writable", "False") == "False":
2223 resource.remove(field)
2224 continue
2225
2226 if get(HASOPTIONS, "False") == "True" and \
2227 get(TYPE) != "boolean":
2228 s3ocrselect = field.getchildren()[0]
2229 for option in s3ocrselect.iterchildren():
2230 if option.text == "" or option.text == None:
2231 s3ocrselect.remove(option)
2232 continue
2233
2234 return s3ocr_root
2235
2236
2238 """
2239 Produces OCR Compatible PDF forms
2240 """
2241
2242 T = current.T
2243 s3ocr_root = self.__s3OCREtree()
2244 self.s3ocrxml = etree.tostring(s3ocr_root, pretty_print=DEBUG)
2245 self.content = []
2246 s3ocr_layout_etree = self.layoutEtree
2247
2248
2249
2250
2251
2252
2253
2254 ITEXT = "label"
2255 HINT = "comment"
2256 TYPE = "type"
2257 HASOPTIONS = "has_options"
2258 LINES = "lines"
2259 BOXES = "boxes"
2260 REFERENCE = "reference"
2261 RESOURCE = "resource"
2262
2263 dtformat = current.deployment_settings.get_L10n_datetime_format()
2264 if str(dtformat)[:2] == "%m":
2265
2266 date_hint = T("fill in order: month(2) day(2) year(4)")
2267 datetime_hint = T("fill in order: hour(2) min(2) month(2) day(2) year(4)")
2268 else:
2269
2270 date_hint = T("fill in order: day(2) month(2) year(4)")
2271 datetime_hint = T("fill in order: hour(2) min(2) day(2) month(2) year(4)")
2272 l10n = {
2273 "datetime_hint": {
2274 "date": date_hint,
2275 "datetime": datetime_hint,
2276 },
2277 "boolean": {
2278 "yes": T("Yes"),
2279 "no": T("No"),
2280 },
2281 "select": {
2282 "multiselect": T("Select one or more option(s) that apply"),
2283 "singleselect": T("Select the option that applies"),
2284 },
2285 }
2286
2287
2288 append = self.content.append
2289 SubElement = etree.SubElement
2290 for resource in s3ocr_root:
2291 name = resource.attrib.get("name")
2292
2293 s3ocr_layout_resource_etree = SubElement(s3ocr_layout_etree,
2294 "resource",
2295 name=name)
2296
2297 styleSheet = getStyleSheet()
2298
2299
2300
2301
2302
2303
2304
2305 for field in resource.iterchildren():
2306 get = field.attrib.get
2307
2308 s3ocr_layout_field_etree = SubElement(s3ocr_layout_resource_etree,
2309 "field",
2310 name=get("name"),
2311 type=get("type"))
2312
2313 if get(REFERENCE) == "1":
2314 s3ocr_layout_field_etree.set(REFERENCE, "1")
2315 s3ocr_layout_field_etree.set(RESOURCE, get(RESOURCE))
2316
2317 fieldlabel = get(ITEXT)
2318 spacing = " " * 5
2319 fieldhint = self.__trim(get(HINT))
2320
2321 if fieldhint:
2322 append(Paragraph(html_unescape_and_strip("%s%s( %s )" % \
2323 (fieldlabel,
2324 spacing,
2325 fieldhint)),
2326 styleSheet["Question"]))
2327
2328 else:
2329 append(Paragraph(html_unescape_and_strip(fieldlabel),
2330 styleSheet["Question"]))
2331
2332 if get("readable", "False") == "True" and \
2333 get("writable", "False") == "False":
2334 append(Paragraph(html_unescape_and_strip(get("default",
2335 "No default Value")),
2336 styleSheet["DefaultAnswer"]))
2337
2338
2339 s3ocr_layout_resource_etree.remove(s3ocr_layout_field_etree)
2340
2341 elif get(HASOPTIONS) == "True":
2342 fieldtype = get(TYPE)
2343
2344 if fieldtype == "boolean":
2345 bool_text = l10n.get("boolean")
2346 append(DrawOptionBoxes(s3ocr_layout_field_etree,
2347 [bool_text.get("yes").decode("utf-8"),
2348 bool_text.get("no").decode("utf-8")],
2349 ["yes", "no"]))
2350
2351 else:
2352 if fieldtype == "multiselect":
2353 option_hint = l10n.get("select").get("multiselect")
2354 else:
2355
2356 option_hint = None
2357
2358 s3ocrselect = field.getchildren()[0]
2359 numoptions = len(s3ocrselect.getchildren())
2360
2361 if numoptions <= MAX_FORM_OPTIONS_LIMIT:
2362 s3ocr_layout_field_etree.attrib["limitcrossed"] = "1"
2363 if option_hint:
2364 append(DrawHintBox(option_hint.decode("utf-8")))
2365
2366 options = s3ocrselect.iterchildren()
2367
2368 opts = []
2369 oppend = opts.append
2370 range = int(math.ceil(numoptions / 4.0))
2371 for row in xrange(range):
2372 labels = []
2373 lappend = labels.append
2374 values = []
2375 vappend = values.append
2376 i = 1
2377 for option in options:
2378 label = option.text
2379 if label in opts:
2380 continue
2381 oppend(label)
2382 lappend(label)
2383 vappend(option.attrib.get("value"))
2384 if i == 4:
2385 break
2386 i += 1
2387 append(DrawOptionBoxes(s3ocr_layout_field_etree,
2388 labels,
2389 values))
2390 else:
2391 append(DrawHintBox(T("Enter a value carefully without spelling mistakes, this field needs to match existing data.").decode("utf-8")))
2392 for line in xrange(2):
2393 append(StringInputBoxes(numBoxes=None,
2394 etreeElem=s3ocr_layout_field_etree))
2395 else:
2396
2397 fieldtype = get(TYPE)
2398 BOXES_TYPES = ["string", "textbox", "integer",
2399 "double", "date", "datetime",]
2400 if fieldtype in BOXES_TYPES:
2401 if fieldtype in ["string", "textbox"]:
2402
2403 num_lines = int(get("lines", 1))
2404 for line in xrange(num_lines):
2405 append(StringInputBoxes(numBoxes=None,
2406 etreeElem=s3ocr_layout_field_etree))
2407
2408 elif fieldtype in ["integer", "double"]:
2409 num_boxes = int(get("boxes", 9))
2410 append(StringInputBoxes(numBoxes=num_boxes,
2411 etreeElem=s3ocr_layout_field_etree))
2412
2413 elif fieldtype in ["date", "datetime"]:
2414
2415
2416
2417
2418
2419 if fieldtype == "datetime":
2420 append(DateTimeBoxes(s3ocr_layout_field_etree))
2421 elif fieldtype == "date":
2422 append(DateBoxes(s3ocr_layout_field_etree))
2423
2424 else:
2425 self.r.error(501, current.ERROR.PARSE_ERROR)
2426 return
2427
2428
2430 """
2431 return layout file
2432
2433 @return: layout xml for the generated OCR form
2434 """
2435
2436 prettyprint = True if DEBUG else False
2437 return etree.tostring(self.layoutEtree, pretty_print=prettyprint)
2438
2439
2440 @staticmethod
2442 """
2443 Helper to trim off any enclosing parentheses
2444
2445 @param text: text which needs to be trimmed
2446
2447 @return: text with front and rear parentheses stripped
2448 """
2449
2450 if isinstance(text, str) and \
2451 text[0] == "(" and \
2452 text[-1] == ")":
2453 text = text[1:-1]
2454 return text
2455
2456
2483
2484
2485 @staticmethod
2487 """
2488 Books a revision number for current operation in ocr_meta
2489
2490 @param formUUID: uuid of the generated form
2491 @param formResourceName: name of the eden resource
2492 """
2493
2494 db = current.db
2495 table = current.s3db.ocr_meta
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505 import uuid
2506 revision = uuid.uuid5(formUUID, formResourceName).hex.upper()[:6]
2507
2508 table.insert(form_uuid=formUUID,
2509 resource_name=formResourceName,
2510 revision=revision)
2511
2512 return revision
2513
2514
2515 @staticmethod
2517 """
2518 Method to extract a generic title from the resource using the
2519 crud strings
2520
2521 @param: resource: a S3Resource object
2522
2523 @return: the title as a String
2524 """
2525
2526 try:
2527 return current.response.s3.crud_strings.get(resource.table._tablename).get("title_list")
2528 except:
2529
2530 return current.T(resource.name.replace("_", " ")).decode("utf-8")
2531
2532
def setMargins(self, left=None, right=None, top=None, bottom=None):
    """
        Method to set the margins of the document

        @param left: the size of the left margin, default None
        @param right: the size of the right margin, default None
        @param top: the size of the top margin, default None
        @param bottom: the size of the bottom margin, default None

        A margin is only changed if a value is provided (0 counts as a
        value), otherwise the last value that was set is re-applied to
        the document. The initial values (0.4 inch) are set up in
        newDocument()

        @todo: make this for a page rather than the document
    """

    requested = (("leftMargin", left),
                 ("rightMargin", right),
                 ("topMargin", top),
                 ("bottomMargin", bottom),
                 )
    for attribute, size in requested:
        if size is not None:
            # Remember the new size for later calls without a value
            setattr(self, attribute, size)
        # Always push the remembered size onto the document template
        setattr(self.doc, attribute, getattr(self, attribute))
2569
2570
2572 """
2573 Method to set the orientation of the document to be portrait
2574
2575 @todo: make this for a page rather than the document
2576 """
2577
2578 self.doc.pagesize = portrait(self.paper_size)
2579
2580
2582 """
2583 Method to set the orientation of the document to be landscape
2584
2585 @todo: make this for a page rather than the document
2586 """
2587
2588 self.doc.pagesize = landscape(self.paper_size)
2589
2590
def addTable(self,
             resource=None,
             raw_data=None,
             list_fields=None,
             report_groupby=None,
             report_hide_comments=False
             ):
    """
        Method to create a table that will be inserted into the document

        @param resource: A S3Resource object
        @param raw_data: handed to S3PDFTable as raw_data
        @param list_fields: A list of field names
        @param report_groupby: A field name that is to be used as a sub-group
               All the records that share the same report_groupby value will
               be clustered together
        @param report_hide_comments: Any comment field will be hidden

        This uses the class S3PDFTable to build and properly format the table.
        The table is then built and stored in the document flow ready for
        generating the pdf.

        If the table is too wide for the page then it will automatically
        adjust the margin, font or page orientation. If it is still too
        wide then the table will be split across multiple pages.
    """

    table = S3PDFTable(document=self,
                       resource=resource,
                       raw_data=raw_data,
                       list_fields=list_fields,
                       groupby=report_groupby,
                       hide_comments=report_hide_comments
                       )
    result = table.build()
    # build() may return None, in which case nothing is added
    if result is not None:
        self.content += result
2627
2628
2632 """
2633 Method to convert the HTML generated for a rHeader into PDF
2634 """
2635
2636
2637 try:
2638
2639 repr = self.r.representation
2640 self.r.representation = "html"
2641 html = rHeader(self.r)
2642 self.r.representation = repr
2643 except:
2644
2645 html = rHeader
2646 parser = S3html2pdf(pageWidth = self.doc.width,
2647 exclude_class_list=["tabs"])
2648 result = parser.parse(html)
2649 if result != None:
2650 self.content += result
2651
2652
2659 """
2660 Method to create a rHeader table that is inserted into the document
2661
2662 @param resource: A S3Resource object
2663 @param list_Fields: A list of field names
2664 @param report_hide_comments: Any comment field will be hidden
2665
2666 This uses the class S3PDFTable to build and properly format the table.
2667 The table is then built and stored in the document flow ready for
2668 generating the pdf.
2669 """
2670
2671 rHeader = S3PDFRHeader(self,
2672 resource,
2673 raw_data,
2674 list_fields,
2675 report_hide_comments
2676 )
2677 result = rHeader.build()
2678 if result != None:
2679 self.content += result
2680
2681
2690
2691
2693 """
2694 Method to create a paragraph that may be inserted into the document
2695
2696 @param text: The text for the paragraph
2697 @param append: If True then the paragraph will be stored in the
2698 document flow ready for generating the pdf.
2699
2700 @return The paragraph
2701
2702 This method can return the paragraph rather than inserting into the
2703 document. This is useful if the paragraph needs to be first
2704 inserted in another flowable, before being added to the document.
2705 An example of when this is useful is when large amounts of text
2706 (such as a comment) are added to a cell of a table.
2707 """
2708
2709 if text != "":
2710 if style == None:
2711 styleSheet = getSampleStyleSheet()
2712 style = styleSheet["Normal"]
2713 para = Paragraph(text, style)
2714 if append:
2715 self.content.append(para)
2716 return para
2717 return ""
2718
2719
2721 """
2722 Add a spacer to the story
2723 """
2724
2725 spacer = Spacer(1, height)
2726 if append:
2727 self.content.append(spacer)
2728 return spacer
2729
2730
2732 """
2733 Add an overlay to the page
2734 """
2735
2736 self.content.append(Overlay(callback, data))
2737
2738
2740 """
2741 Add square text boxes for text entry to the story
2742 """
2743
2744 boxes = StringInputBoxes(cnt, etree.Element("dummy"))
2745 if append:
2746 self.content.append(boxes)
2747 return boxes
2748
2749
def throwPageBreak(self):
    """
        Force a page break in the report by appending a PageBreak
        to the document content
    """

    page_break = PageBreak()
    self.content.append(page_break)
2756
2757
def changePageTitle(self, newTitle):
    """
        Append a ChangePageTitle element for a new title to the
        document content

        @param newTitle: the title handed to ChangePageTitle
    """

    title_change = ChangePageTitle(self, newTitle)
    self.content.append(title_change)
2764
2765
def getStyledTable(self, table, colWidths=None, rowHeights=None, style=None):
    """
        Method to create a simple table

        @param table: the cell data as a list of rows; run through
                      addCellStyling() before the Table is created
        @param colWidths: handed to Table as colWidths
        @param rowHeights: handed to Table as rowHeights
        @param style: list of style commands, default None (no styles);
                      addCellStyling() extends this list in place

        @return: the Table built from the cells and styles
    """

    if style is None:
        # A fresh list per call: addCellStyling() appends to the list it
        # is given, so a shared default list would carry the styles of
        # one table over to every later table
        style = []
    (cells, style) = self.addCellStyling(table, style)
    return Table(cells,
                 colWidths=colWidths,
                 rowHeights=rowHeights,
                 style=style,
                 )
2777
2778
2780 """
2781 Method to calculate the dimensions of the table
2782 """
2783
2784 tempDoc = EdenDocTemplate(StringIO())
2785 tempDoc.setPageTemplates(lambda x, y: None, lambda x, y: None)
2786 tempDoc.pagesize = portrait(self.paper_size)
2787 tempDoc.build([tempTable], canvasmaker=canvas.Canvas)
2788 return (tempTable._colWidths, tempTable._rowHeights)
2789
2790
2792 """
2793 Add special styles to the text in a cell
2794 """
2795
2796 if style == "*GREY":
2797 return [("TEXTCOLOR", cell, cell, colors.lightgrey)]
2798 elif style == "*RED":
2799 return [("TEXTCOLOR", cell, cell, colors.red)]
2800 return []
2801
2802
2804 """
2805 Add special styles to the text in a table
2806 """
2807
2808 row = 0
2809 for line in table:
2810 col = 0
2811 for cell in line:
2812 try:
2813 if cell.startswith("*"):
2814 (instruction,sep,text) = cell.partition(" ")
2815 style += self.cellStyle(instruction, (col, row))
2816 table[row][col] = text
2817 except:
2818 pass
2819 col += 1
2820 row += 1
2821 return (table, style)
2822
2823
2825 """
2826 Method to add a banner to a page
2827 used by pageHeader
2828 """
2829
2830 self.headerBanner = os.path.join(current.request.folder,image)
2831
2832
2834 """
2835 Method to generate the basic look of a page.
2836 It is a callback method and will not be called directly
2837 """
2838
2839 canvas.saveState()
2840 if self.logo and os.path.exists(self.logo):
2841 im = Image.open(self.logo)
2842 (iwidth, iheight) = im.size
2843 height = 1.0 * inch
2844 width = iwidth * (height/iheight)
2845 canvas.drawImage(self.logo,
2846 inch,
2847 doc.pagesize[1] - 1.2 * inch,
2848 width = width,
2849 height = height)
2850 if self.headerBanner and os.path.exists(self.headerBanner):
2851 im = Image.open(self.headerBanner)
2852 (iwidth, iheight) = im.size
2853 height = 0.75 * inch
2854 width = iwidth * (height / iheight)
2855 canvas.drawImage(self.headerBanner,
2856 3 * inch,
2857 doc.pagesize[1] - 0.95 * inch,
2858 width = width,
2859 height = height)
2860 canvas.setFont("Helvetica-Bold", 14)
2861 canvas.drawCentredString(doc.pagesize[0] / 2.0,
2862 doc.pagesize[1] - 1.3*inch, self.title
2863 )
2864 canvas.setFont("Helvetica-Bold", 8)
2865 now = S3DateTime.datetime_represent(datetime.utcnow(), utc=True)
2866 canvas.drawCentredString(doc.pagesize[0] - 1.5 * inch,
2867 doc.pagesize[1] - 1.3 * inch, now
2868 )
2869 canvas.restoreState()
2870
2871
2873 """
2874 Method to generate the basic look of a page.
2875 It is a callback method and will not be called directly
2876 """
2877
2878 canvas.saveState()
2879 canvas.setFont("Helvetica", 7)
2880 canvas.drawString(inch, 0.75 * inch,
2881 "Page %d %s" % (doc.page,
2882 self.prevtitle
2883 )
2884 )
2885 self.prevtitle = self.title
2886 canvas.restoreState()
2887
2888
2890 """
2891 Method to build the PDF document.
2892 The response headers are set up for a pdf document and the document
2893 is then sent
2894
2895 @return the document as a stream of characters
2896
2897 @todo add a proper template class so that the doc.build is more generic
2898 """
2899
2900 styleSheet = getSampleStyleSheet()
2901 self.doc.build(self.content,
2902 canvasmaker=canvas.Canvas)
2903 self.output.seek(0)
2904 return self.output.read()
2905
2906
2907
2908 if reportLabImported:
2909
2910
3056
3059 """
3060 Class to get the labels and the data from the database
3061 """
3062
3064 """
3065 Method to create the S3PDFDataSource object
3066 """
3067
3068 self.resource = obj.resource
3069 self.list_fields = obj.list_fields
3070 self.report_groupby = obj.report_groupby
3071 self.hideComments = obj.hideComments
3072 self.fields = None
3073 self.labels = None
3074 self.records = False
3075
3076
3131
3132
3133
3134
3135
3136
3138 """
3139 Internally used method to get the field labels
3140
3141 Used to remove the report_groupby label (if present)
3142 """
3143
3144
3145 labels = self.labels
3146 if self.report_groupby != None:
3147 for label in labels:
3148 if label == self.report_groupby.label:
3149 labels.remove(label)
3150 return labels
3151
3152
3154 """
3155 Internally used method to format the data from the database
3156
3157 This will extract the data from the returned records list.
3158
3159 If there is a groupby then the records will be grouped by this field.
3160 For each new value the groupby field will be placed in a list of
3161 its own. This will then be followed by lists of the records that
3162 share this value
3163
3164 If there is no groupby then the result is a simple matrix of
3165 rows by fields
3166 """
3167
3168
3169 data = []
3170 currentGroup = None
3171 subheadingList = []
3172 rowNumber = 1
3173 for item in self.records:
3174 row = []
3175 if self.report_groupby != None:
3176
3177
3178 groupData = s3_represent_value(self.report_groupby,
3179 record=item,
3180 strip_markup=True,
3181 non_xml_output=True
3182 )
3183 if groupData != currentGroup:
3184 currentGroup = groupData
3185 data.append([groupData])
3186 subheadingList.append(rowNumber)
3187 rowNumber += 1
3188
3189 for field in self.fields:
3190 if self.report_groupby != None:
3191 if field.label == self.report_groupby.label:
3192 continue
3193 if field.field:
3194 text = s3_represent_value(field.field,
3195 record=item,
3196 strip_markup=True,
3197 non_xml_output=True,
3198 extended_comments=True
3199 )
3200 if text == "" or not field.field:
3201
3202
3203 tname = field.tname
3204 fname = field.fname
3205 if fname in item:
3206 text = item[fname]
3207 elif tname in item and fname in item[tname]:
3208 text = item[tname][fname]
3209 else:
3210 text = ""
3211 row.append(text)
3212 data.append(row)
3213 rowNumber += 1
3214 return (subheadingList, data)
3215
3218 """
3219 Class to build a simple table that holds the details of one record,
3220 which can then be placed in a pdf document
3221
3222 This class doesn't need to be called directly.
3223 Rather see S3PDF.addrHeader()
3224 """
3225
3233 """
3234 Method to create an rHeader object
3235
3236 @param document: An S3PDF object
3237 @param resource: An S3Resource object
3238 @param list_fields: A list of field names
3239 @param hide_comments: Any comment field will be hidden
3240 """
3241
3242 self.pdf = document
3243 self.resource = resource
3244 self.raw_data = raw_data
3245 self.list_fields = list_fields
3246 self.hideComments = hide_comments
3247 self.report_groupby = None
3248 self.data = []
3249 self.subheadingList = []
3250 self.labels = []
3251 self.fontsize = 10
3252
3253
3255 """
3256 Method to build the table.
3257
3258 @return: A list of Table objects. Normally this will be a list with
3259 just one table object, but if the table needs to be split
3260 across columns then one object per page will be created.
3261 """
3262
3263 if self.resource != None:
3264 ds = S3PDFDataSource(self)
3265
3266 ds.select()
3267 self.labels = ds.getLabels()
3268 self.data.append(self.labels)
3269 (self.subheadingList, data) = ds.getData()
3270 self.data + data
3271
3272 if self.raw_data != None:
3273 self.data = self.raw_data
3274
3275 self.rheader = []
3276 if len(self.data) == 0:
3277 return None
3278 else:
3279 NONE = current.messages["NONE"]
3280 for index in range(len(self.labels)):
3281 try:
3282 value = data[0][index]
3283 except:
3284 value = NONE
3285 self.rheader.append([self.labels[index],
3286 value])
3287 content = []
3288 style = [("FONTSIZE", (0, 0), (-1, -1), self.fontsize),
3289 ("VALIGN", (0, 0), (-1, -1), "TOP"),
3290 ("FONTNAME", (0, 0), (0, -1), "Helvetica-Bold"),
3291 ("FONTNAME", (1, 0), (1, -1), "Helvetica"),
3292 ]
3293 (self.rheader,style) = self.pdf.addCellStyling(self.rheader, style)
3294 table = Table(self.rheader,
3295 repeatRows=1,
3296 style=style,
3297 hAlign="LEFT",
3298 )
3299 content.append(table)
3300 return content
3301
3302
3303
3304 if reportLabImported:
3308 """
3309 Draw a horizontal line
3310 """
3311
3313 Flowable.__init__(self)
3314 self.lineThickness = 1
3315 if current.deployment_settings.get_pdf_size() == "Letter":
3316 self.paper_size = LETTER
3317 else:
3318 self.paper_size = A4
3319
3320
3322 canv = self.canv
3323 pagewidth, pageheight = self.paper_size
3324 self.canv.line(0, -5, pagewidth - 100, -5)
3325
3326
def wrap(self, availWidth, availHeight):
    """
        Report the space needed by the line

        @param availWidth: the available width, all of which is claimed
        @param availHeight: the available height (not used)

        @return: tuple (width, height), the height being the line thickness
    """

    size = (availWidth, self.lineThickness)
    self._width, self._height = size
    return size
3331
3397
3400 """
3401 Draw date boxes
3402 """
3403
3405 Flowable.__init__(self)
3406 self.spaceAfter = 2
3407 self.sideLength = 15
3408 self.fontsize = 10
3409 self.etreeElem = etreeElem
3410 if current.deployment_settings.get_pdf_size() == "Letter":
3411 self.paper_size = LETTER
3412 else:
3413 self.paper_size = A4
3414
3415
3417 canv = self.canv
3418 pagewidth, pageheight = self.paper_size
3419 canv.setLineWidth(0.90)
3420 canv.setStrokeGray(0.9)
3421 widthPointer = self.fontsize
3422
3423 xpadding = 6
3424 ypadding = 4
3425 margin = 50
3426
3427
3428
3429 markerOrigin = (29, 29)
3430 xCoord = pagewidth - \
3431 (self.layoutCoords[0] + xpadding + margin) - \
3432 markerOrigin[0] + \
3433 self.fontsize
3434 yCoord = pageheight - \
3435 (self.layoutCoords[1] + ypadding + margin) - \
3436 markerOrigin[1]
3437
3438 sideLength = self.sideLength
3439 rect = self.canv.rect
3440 for box in xrange(1, 11):
3441 if box not in (3, 6):
3442 rect(widthPointer,
3443 0,
3444 sideLength,
3445 sideLength)
3446 else:
3447 self.canv.drawString(widthPointer + 5,
3448 self.height,
3449 "/")
3450 widthPointer += 15
3451 getPageNumber = self.canv.getPageNumber
3452 dtformat = current.deployment_settings.get_L10n_datetime_format()
3453 if str(dtformat)[:2] == "%m":
3454
3455 DateBoxEtree = etree.SubElement(self.etreeElem,
3456 "textbox",
3457 x="%s" % xCoord,
3458 y="%s" % yCoord,
3459 side="%s" % sideLength,
3460 boxes="2",
3461 page="%s" % getPageNumber())
3462 DateBoxEtree.text = "MO"
3463 DateBoxEtree = etree.SubElement(self.etreeElem,
3464 "textbox",
3465 x="%s" % (xCoord + (sideLength * 3)),
3466 y="%s" % yCoord,
3467 side="%s" % sideLength,
3468 boxes="2",
3469 page="%s" % getPageNumber())
3470 DateBoxEtree.text = "DD"
3471 else:
3472
3473 DateBoxEtree = etree.SubElement(self.etreeElem,
3474 "textbox",
3475 x="%s" % xCoord,
3476 y="%s" % yCoord,
3477 side="%s" % sideLength,
3478 boxes="2",
3479 page="%s" % getPageNumber())
3480 DateBoxEtree.text = "DD"
3481 DateBoxEtree = etree.SubElement(self.etreeElem,
3482 "textbox",
3483 x="%s" % (xCoord + (sideLength * 3)),
3484 y="%s" % yCoord,
3485 side="%s" % sideLength,
3486 boxes="2",
3487 page="%s" % getPageNumber())
3488 DateBoxEtree.text = "MO"
3489 DateBoxEtree = etree.SubElement(self.etreeElem,
3490 "textbox",
3491 x="%s" % (xCoord + (sideLength * 6)),
3492 y="%s" % yCoord,
3493 side="%s" % sideLength,
3494 boxes="4",
3495 page="%s" % getPageNumber())
3496 DateBoxEtree.text = "YYYY"
3497
3498
def wrap(self, availWidth, availHeight):
    """
        Report the space needed by the row of date boxes

        The available space is remembered in self.layoutCoords.

        @param availWidth: the available width, all of which is claimed
        @param availHeight: the available height

        @return: tuple (width, height), the height being one box side
                 plus the space after it
    """

    self.layoutCoords = (availWidth, availHeight)
    needed_height = self.sideLength + self.spaceAfter
    self._width = availWidth
    self._height = needed_height
    return (self._width, self._height)
3504
3507 """
3508 Draw datetime boxes
3509 """
3510
3512 Flowable.__init__(self)
3513 self.spaceAfter = 2
3514 self.sideLength = 15
3515 self.fontsize = 10
3516 self.etreeElem = etreeElem
3517 if current.deployment_settings.get_pdf_size() == "Letter":
3518 self.paper_size = LETTER
3519 else:
3520 self.paper_size = A4
3521
3522
3524 canv = self.canv
3525 pagewidth, pageheight = self.paper_size
3526 canv.setLineWidth(0.90)
3527 canv.setStrokeGray(0.9)
3528 widthPointer = self.fontsize
3529
3530 xpadding = 6
3531 ypadding = 4
3532 margin = 50
3533
3534
3535
3536 markerOrigin = (29, 29)
3537 xCoord = pagewidth - \
3538 (self.layoutCoords[0] + xpadding + margin) - \
3539 markerOrigin[0]+\
3540 self.fontsize
3541 yCoord = pageheight - \
3542 (self.layoutCoords[1] + ypadding + margin) - \
3543 markerOrigin[1]
3544
3545 for box in xrange(1, 18):
3546 if box not in (3, 6, 7, 10, 13):
3547 self.canv.rect(widthPointer,
3548 0,
3549 self.sideLength,
3550 self.sideLength)
3551 widthPointer += 15
3552 DateTimeBoxEtree = etree.SubElement(self.etreeElem,
3553 "textbox",
3554 x="%s" % xCoord,
3555 y="%s" % yCoord,
3556 side="%s" % self.sideLength,
3557 boxes="2",
3558 page="%s" % self.canv.getPageNumber())
3559 DateTimeBoxEtree.text = "HH"
3560 DateTimeBoxEtree = etree.SubElement(self.etreeElem,
3561 "textbox",
3562 x="%s" % (xCoord + (self.sideLength * 3)),
3563 y="%s" % yCoord,
3564 side="%s" % self.sideLength,
3565 boxes="2",
3566 page="%s" % self.canv.getPageNumber())
3567 DateTimeBoxEtree.text = "MM"
3568 dtformat = current.deployment_settings.get_L10n_datetime_format()
3569 if str(dtformat)[:2] == "%m":
3570
3571 DateTimeBoxEtree = etree.SubElement(self.etreeElem,
3572 "textbox",
3573 x="%s" % (xCoord + (self.sideLength * 7)),
3574 y="%s" % yCoord,
3575 side="%s" % self.sideLength,
3576 boxes="2",
3577 page="%s" % self.canv.getPageNumber())
3578 DateTimeBoxEtree.text = "MO"
3579 DateTimeBoxEtree = etree.SubElement(self.etreeElem,
3580 "textbox",
3581 x="%s" % (xCoord + (self.sideLength * 10)),
3582 y="%s" % yCoord,
3583 side="%s" % self.sideLength,
3584 boxes="2",
3585 page="%s" % self.canv.getPageNumber())
3586 DateTimeBoxEtree.text = "DD"
3587 else:
3588
3589 DateTimeBoxEtree = etree.SubElement(self.etreeElem,
3590 "textbox",
3591 x="%s" % (xCoord + (self.sideLength * 7)),
3592 y="%s" % yCoord,
3593 side="%s" % self.sideLength,
3594 boxes="2",
3595 page="%s" % self.canv.getPageNumber())
3596 DateTimeBoxEtree.text = "DD"
3597 DateTimeBoxEtree = etree.SubElement(self.etreeElem,
3598 "textbox",
3599 x="%s" % (xCoord + (self.sideLength * 10)),
3600 y="%s" % yCoord,
3601 side="%s" % self.sideLength,
3602 boxes="2",
3603 page="%s" % self.canv.getPageNumber())
3604 DateTimeBoxEtree.text = "MO"
3605 DateTimeBoxEtree = etree.SubElement(self.etreeElem,
3606 "textbox",
3607 x="%s" % (xCoord + (self.sideLength * 13)),
3608 y="%s" % yCoord,
3609 side="%s" % self.sideLength,
3610 boxes="4",
3611 page="%s" % self.canv.getPageNumber())
3612 DateTimeBoxEtree.text = "YYYY"
3613
3614
def wrap(self, availWidth, availHeight):
    """
        Report the space needed by the row of datetime boxes

        The available space is remembered in self.layoutCoords.

        @param availWidth: the available width, all of which is claimed
        @param availHeight: the available height

        @return: tuple (width, height), the height being one box side
                 plus the space after it
    """

    layout = (availWidth, availHeight)
    self.layoutCoords = layout
    self._width = layout[0]
    self._height = self.sideLength + self.spaceAfter
    return self._width, self._height
3620
3623 """
3624 Draw a set of Option Boxes (for Boolean or Multi-Select)
3625 - along with Labels
3626 """
3627
def __init__(self, etreeElem, labels, values):
    """
        Set up the option boxes

        @param etreeElem: the element the box positions get recorded under
        @param labels: the labels of the options, must not be empty
        @param values: the values of the options, parallel to the labels
    """

    Flowable.__init__(self)

    # Layout settings
    self.fontsize = 8
    self.spaceAfter = 2

    # Content
    self.etreeElem = etreeElem
    self.labels = labels
    self.text = labels[0]
    self.values = values

    # Paper size as per deployment settings
    letter = current.deployment_settings.get_pdf_size() == "Letter"
    self.paper_size = LETTER if letter else A4
3640
3641
3643 canv = self.canv
3644 pagewidth, pageheight = self.paper_size
3645 canv.setLineWidth(0.90)
3646 canv.setStrokeGray(0.9)
3647 fontsize = self.fontsize
3648 radius = (fontsize / 2) - 1
3649
3650 xpadding = 6
3651 ypadding = 8
3652 margin = 50
3653
3654
3655
3656 markerOrigin = (29, 29)
3657 layoutCoords = self.layoutCoords
3658 pwidth = pagewidth - (layoutCoords[0] + xpadding + margin) - markerOrigin[0]
3659 pheight = pageheight - (layoutCoords[1] + ypadding + margin) - markerOrigin[1]
3660 labels = self.labels
3661 index = 0
3662 values = self.values
3663 circle = self.canv.circle
3664 drawString = self.canv.drawString
3665 getPageNumber = self.canv.getPageNumber
3666 etreeElem = self.etreeElem
3667 height = self.height
3668 cheight = height + (fontsize / 4) + 1
3669 width = self.width
3670
3671 cwidth = width + fontsize
3672
3673 _cwidth = width + fontsize
3674 _swidth = width + (fontsize * 2)
3675 for label in labels:
3676
3677 circleCenter = (_cwidth, cheight)
3678 circle(circleCenter[0],
3679 circleCenter[1],
3680 radius,
3681 fill=0)
3682
3683 drawString(_swidth, height,
3684 html_unescape_and_strip(label))
3685 xCoord = pwidth + circleCenter[0]
3686 yCoord = pheight + circleCenter[0]
3687 optionBoxEtree = etree.SubElement(etreeElem,
3688 "optionbox",
3689 x="%s" % xCoord,
3690 y="%s" % yCoord,
3691 radius="%s" % radius,
3692 boxes="1",
3693 page="%s" % getPageNumber())
3694 optionBoxEtree.set("value", values[index])
3695 optionBoxEtree.text = label
3696 xwidth = cwidth + (fontsize * (len(label) + 2)) / 1.4
3697 _cwidth += xwidth
3698 _swidth += xwidth
3699 index += 1
3700
3701
def wrap(self, availWidth, availHeight):
    """
        Report the space needed by the option boxes and their labels

        The available space is remembered in self.layoutCoords.

        @param availWidth: the available width
        @param availHeight: the available height

        @return: tuple (width, height); the width is estimated from the
                 label lengths (plus 8 characters per option) and the
                 font size, the height is the font size plus the space
                 after
    """

    self.layoutCoords = availWidth, availHeight

    characters = sum(len(label) + 8 for label in self.labels)
    fontsize = self.fontsize

    self._width = (fontsize * characters) / 2
    self._height = fontsize + self.spaceAfter
    return self._width, self._height
3711
3714 """
3715 Draw Help Text to explain how to fill out a question
3716 """
3717
3719 Flowable.__init__(self)
3720 self.text = text
3721 self.fontsize = 6
3722 self.spaceAfter = 2
3723 if current.deployment_settings.get_pdf_size() == "Letter":
3724 self.paper_size = LETTER
3725 else:
3726 self.paper_size = A4
3727
3728
3730 canv = self.canv
3731 canv.setFillGray(0.4)
3732 self.canv.drawString(self.width + (self.fontsize / 2),
3733 self.height,
3734 html_unescape_and_strip(self.text))
3735
3736
def wrap(self, availWidth, availHeight):
    """
        Report the space needed by the hint text

        @param availWidth: the available width (not used)
        @param availHeight: the available height (not used)

        @return: tuple (width, height); the width is estimated from the
                 text length (plus 4 characters) and the font size, the
                 height is the font size plus the space after
    """

    fontsize = self.fontsize
    characters = len(self.text) + 4

    self._height = fontsize + self.spaceAfter
    self._width = (fontsize * characters) / 2
    return self._width, self._height
3742
3743
3744
# Font names for the styled variants of the base font, as resolved by tt2ps
# NOTE(review): the flags are presumably (bold, italic), going by the
#               B / I / BI suffixes - confirm against the ReportLab docs
_baseFontNameB = tt2ps(_baseFontName, 1, 0)
_baseFontNameI = tt2ps(_baseFontName, 0, 1)
_baseFontNameBI = tt2ps(_baseFontName, 1, 1)
3750 """
3751 """
3752
3753 styleSheet = getSampleStyleSheet()
3754 styleSheet.add(ParagraphStyle(name="Instructions",
3755 parent=styleSheet["Bullet"],
3756 fontName=_baseFontName,
3757 fontSize=12,
3758 firstLineIndent=0,
3759 spaceBefore=3),
3760 alias="Inst")
3761 styleSheet.add(ParagraphStyle(name="Section",
3762 parent=styleSheet["Normal"],
3763 fontName=_baseFontName,
3764 fontSize=13,
3765 spaceBefore=5,
3766 spaceAfter=5,
3767 firstLineIndent=0),
3768 alias="Sec")
3769 styleSheet.add(ParagraphStyle(name="Question",
3770 parent=styleSheet["Normal"],
3771 fontName=_baseFontName,
3772 fontSize=11,
3773 firstLineIndent=0,
3774 spaceAfter=5,
3775 spaceBefore=10),
3776 alias="Quest")
3777 styleSheet.add(ParagraphStyle(name="DefaultAnswer",
3778 parent=styleSheet["Normal"],
3779 fontName=_baseFontName,
3780 fontSize=10,
3781 firstLineIndent=0,
3782 spaceBefore=3),
3783 alias="DefAns")
3784 return styleSheet
3785
3786
def html_unescape_and_strip(m):
    """
        Helper function, unescapes the text with html_unescape and
        then passes the result through html_strip

        @param m: the text to clean

        @return: the result of html_strip
    """

    unescaped = html_unescape(m)
    return html_strip(unescaped)
3791 """
3792 Helper function, unscape any html special characters
3793 """
3794
3795 return re.sub("&(%s);" % "|".join(name2codepoint),
3796 lambda m: unichr(name2codepoint[m.group(1)]),
3797 text)
3798
3801 """
3802 Strips html markup from text
3803 """
3804
3805 mark = 0
3806 markstart = 0
3807 markend = 0
3808 index = 0
3809 occur = 0
3810 for i in text:
3811 if i == "<":
3812 try:
3813 if text[index+1] != " ":
3814 mark = 1
3815 markstart = index
3816 except(IndexError):
3817 pass
3818 elif i == ">":
3819 if mark == 1:
3820 mark = 0
3821 markend = index
3822 text = "%s%s" % (text[:markstart], text[markend+1:])
3823 occur = 1
3824 break
3825
3826 index += 1
3827
3828 if occur == 1:
3829 text = html_strip(text)
3830
3831 return text
3832
3833
3834
def cast2ascii(m):
    """
        Helper function, casts non-str text to ASCII

        @param m: the text

        @return: m itself if it is a str, otherwise its NFKD-normalised
                 form encoded as ASCII, with characters that can not be
                 encoded being dropped
    """

    if isinstance(m, str):
        return m
    decomposed = unicodedata.normalize("NFKD", m)
    return decomposed.encode("ascii", "ignore")
3842 """
3843 Image Parsing and OCR Utility
3844 """
3845
3847 """
3848 Intialise class instance with environment variables and functions
3849 """
3850
3851 self.r = r
3852 self.request = current.request
3853 checkDependencies(r)
3854
3855
def parse(self, form_uuid, set_uuid, **kwargs):
    """
        Performs OCR on a given set of pages

        The form metadata is read from ocr_meta and the uploaded page
        images from ocr_payload. Each page gets binarised, straightened
        and scaled by means of its markers. Every component in the
        layout file is then cropped out of its page and passed to
        __ocrIt, and the results are collected in an S3XML tree.

        @param form_uuid: the uuid of the form (as stored in ocr_meta)
        @param set_uuid: the uuid of the image set (as stored in
                         ocr_payload)
        @param kwargs: not used

        @return: the S3XML tree as a pretty-printed string
    """

    self.set_uuid = set_uuid
    db = current.db
    T = current.T
    r = self.r

    # Get metadata of the form
    metatable = db["ocr_meta"]
    row = db(metatable["form_uuid"] == form_uuid).select(limitby=(0, 1)).first()
    resourcename = row["resource_name"]
    layoutfilename = row["layout_file"]
    pages = int(row["pages"])
    is_component = len(r.resource.components) == 1

    # Open each page image
    payloadtable = db["ocr_payload"]
    raw_images = {}
    for eachpage in xrange(1, pages + 1):
        query = (payloadtable["image_set_uuid"] == set_uuid) & \
                (payloadtable["page_number"] == eachpage)
        row = db(query).select().first()
        if row is None:
            # No image for this page: skip it here, so that a component
            # on this page gets reported as "insufficient number of
            # pages" below rather than failing with a TypeError
            continue
        raw_images[eachpage] = Image.open(os.path.join(r.folder,
                                                       "uploads",
                                                       "ocr_payload",
                                                       row["image_file"]))

    # Transform each image
    images = {}
    for page_number, raw_image in raw_images.items():
        image = self.__convertImage2binary(raw_image)
        markers = self.__getMarkers(image)
        orientation = self.__getOrientation(markers)
        if orientation != 0.0:
            # Straighten the page, then locate the markers again
            image = image.rotate(orientation)
            markers = self.__getMarkers(image)
            orientation = self.__getOrientation(markers)
        images[page_number] = {"image": image,
                               "markers": markers,
                               "orientation": orientation,
                               "scalefactor": self.__scaleFactor(markers),
                               }

    # Get the layout file, convert it to an etree
    layout_path = os.path.join(r.folder, "uploads", "ocr_meta", layoutfilename)
    with open(layout_path, "rb") as layout_file:
        layout_etree = etree.fromstring(layout_file.read())

    def crop_region(page, x, y, before, after_x, after_y):
        """
            Crop a region out of a page image; x and y are layout
            coordinates relative to the first marker, the region
            extends from (x - before, y - before) to
            (x + after_x, y + after_y)
        """
        try:
            page_data = images[page]
        except KeyError:
            r.error(501, T("insufficient number of pages provided"))
        origin = page_data["markers"][0]
        scale_x = page_data["scalefactor"]["x"]
        scale_y = page_data["scalefactor"]["y"]
        left = origin[0] + (x * scale_x)
        top = origin[1] + (y * scale_y)
        crop_box = (int(left - before * scale_x),
                    int(top - before * scale_y),
                    int(left + after_x * scale_x),
                    int(top + after_y * scale_y),
                    )
        return page_data["image"].crop(crop_box)

    def ocr_textbox(component, table_name, field_name, seq):
        """
            OCR one row of text boxes; seq is the 1-based position of
            the row within its field
        """
        get = component.attrib.get
        side = float(get("side"))
        cropped_image = crop_region(int(get("page")),
                                    float(get("x")),
                                    float(get("y")),
                                    0,
                                    side * int(get("boxes")),
                                    side)
        return self.__ocrIt(cropped_image,
                            form_uuid,
                            resourcename,
                            seq,
                            resource_table=table_name,
                            field_name=field_name,
                            field_seq=seq)

    def bounded_int(values, key, low, high, default):
        """
            int(values[key]) if that is a number with
            low <= number < high, otherwise the default
        """
        try:
            number = int(values[key])
        except (KeyError, ValueError):
            return default
        if low <= number < high:
            return number
        return default

    # Build the S3XML tree
    s3xml_root_etree = etree.Element("s3xml")
    s3xml_parent_resource_etree = None
    SubElement = etree.SubElement

    for resource in layout_etree:
        table_name = resource.attrib.get("name")

        # Create the resource element
        if is_component:
            s3xml_resource_etree = SubElement(s3xml_root_etree,
                                              "resource")
        elif s3xml_parent_resource_etree is None:
            # The first resource becomes the parent of all others
            s3xml_parent_resource_etree = SubElement(s3xml_root_etree,
                                                     "resource")
            s3xml_resource_etree = s3xml_parent_resource_etree
        else:
            s3xml_resource_etree = SubElement(s3xml_parent_resource_etree,
                                              "resource")
        s3xml_resource_etree.set("name", table_name)

        for field in resource:
            field_name = field.attrib.get("name", None)
            field_type = field.attrib.get("type", None)
            field_is_reference = field.attrib.get("reference") == "1"

            # Create the element which holds the value
            if field_is_reference:
                field_resource = field.attrib.get("resource")
                s3xml_reference_etree = SubElement(s3xml_resource_etree,
                                                   "reference")
                s3xml_reference_etree.set("field", field_name)
                s3xml_reference_etree.set("resource", field_resource)

                s3xml_sub_reference_etree = SubElement(s3xml_reference_etree,
                                                       "resource")
                s3xml_sub_reference_etree.set("name", field_resource)

                s3xml_field_etree = SubElement(s3xml_sub_reference_etree,
                                               "data")
                s3xml_field_etree.set("field", "name")
            else:
                s3xml_field_etree = SubElement(s3xml_resource_etree,
                                               "data")
                s3xml_field_etree.set("field", field_name)

            # Nothing to read for fields without components
            components = list(field)
            if not components:
                continue
            component_type = components[0].tag

            if component_type == "optionbox":
                OCRText = []
                OCRValue = []
                for linenum, component in enumerate(components):
                    get = component.attrib.get
                    comp_radius = float(get("radius"))
                    comp_value = str(get("value"))
                    comp_text = str(component.text)
                    # The box is centred on (x, y)
                    cropped_image = crop_region(int(get("page")),
                                                float(get("x")),
                                                float(get("y")),
                                                comp_radius,
                                                comp_radius,
                                                comp_radius)
                    result = self.__ocrIt(cropped_image,
                                          form_uuid,
                                          resourcename,
                                          linenum,
                                          content_type="optionbox",
                                          resource_table=table_name,
                                          field_name=field_name,
                                          field_value=comp_value)
                    if result:
                        OCRText.append(unicode.strip(comp_text.decode("utf-8")))
                        OCRValue.append(unicode.strip(comp_value.decode("utf-8")))

                # Multiple selections get enclosed in pipes
                template = "%s" if len(OCRValue) < 2 else "|%s|"
                s3xml_field_etree.set("value", template % "|".join(OCRValue))
                s3xml_field_etree.text = template % "|".join(OCRText)

                null_field = not OCRValue

            elif component_type == "textbox":
                if field_type in ("date", "datetime"):
                    # The text of each component names the part of the
                    # date/time which its boxes hold
                    OCRedValues = {}
                    for seq, component in enumerate(components, 1):
                        output = ocr_textbox(component, table_name, field_name, seq)
                        OCRedValues[str(component.text)] = \
                            unicode.strip(output.decode("utf-8"))

                    # Fall back to a default for each part which is
                    # missing, not a number, or out of range
                    yyyy = bounded_int(OCRedValues, "YYYY", 1800, 2300,
                                       datetime.now().year)
                    mo = bounded_int(OCRedValues, "MO", 1, 13, 1)
                    dd = bounded_int(OCRedValues, "DD", 1, 32, 1)

                    # Reset days which do not exist in the month
                    leapyear = yyyy % 4 == 0 and \
                               (yyyy % 100 != 0 or yyyy % 400 == 0)
                    if mo in (4, 6, 9, 11):
                        if dd == 31:
                            dd = 1
                    elif mo == 2 and dd > (29 if leapyear else 28):
                        dd = 1

                    value = None
                    if field_type == "date":
                        value = "%s-%s-%s" % (yyyy, mo, dd)
                    elif field_type == "datetime":
                        hh = bounded_int(OCRedValues, "HH", 0, 24, 0)
                        mm = bounded_int(OCRedValues, "MM", 0, 60, 0)
                        utctime = self.__convert_utc(yyyy, mo, dd, hh, mm)
                        value = utctime.strftime("%Y-%m-%dT%H:%M:%SZ")
                    if value is not None:
                        s3xml_field_etree.set("value", value)
                        s3xml_field_etree.text = value

                    null_field = False

                else:
                    # Normal text field
                    ocrText = "".join(ocr_textbox(component,
                                                  table_name,
                                                  field_name,
                                                  seq)
                                      for seq, component in enumerate(components, 1))
                    output = unicode.strip(ocrText.decode("utf-8"))

                    if field_type in ("double", "integer"):
                        # NOTE(review): doubles are read as integers
                        #               too, so anything with a decimal
                        #               point ends up as 0 - confirm
                        #               whether that is intended
                        try:
                            output = int(self.__strip_spaces(output))
                        except (TypeError, ValueError):
                            output = 0
                        s3xml_field_etree.set("value", "%s" % output)
                        s3xml_field_etree.text = "%s" % output
                    else:
                        s3xml_field_etree.text = output

                    null_field = len("%s" % output) == 0

            else:
                continue

            # Remove the elements of fields which have not been filled in
            if null_field:
                if field_is_reference:
                    s3xml_resource_etree.remove(s3xml_reference_etree)
                else:
                    s3xml_resource_etree.remove(s3xml_field_etree)

    return etree.tostring(s3xml_root_etree, pretty_print=True)
4243
4244
4246 """
4247 Remove all spaces from a string
4248 """
4249
4250 try:
4251 text = "".join(text.split())
4252 except:
4253 pass
4254
4255 return text
4256
4257
def __convert_utc(self,
                  yyyy,
                  mo,
                  dd,
                  hh,
                  mm):
    """
        Convert local time to UTC

        The UTC offset is taken from the logged-in user, its second
        whitespace-separated part is expected to look like "+HHMM" or
        "-HHMM". The local time is returned unchanged if there is no
        user, no offset, or the offset does not have this format.

        @param yyyy: the year
        @param mo: the month
        @param dd: the day
        @param hh: the hours
        @param mm: the minutes

        @return: a datetime (seconds being 0)

        @raise ValueError: if the parameters do not form a valid date/time
    """

    timetuple = datetime.strptime("%s-%s-%s %s:%s:00" % (yyyy,
                                                         mo,
                                                         dd,
                                                         hh,
                                                         mm),
                                  "%Y-%m-%d %H:%M:%S")

    auth = current.auth
    utc_offset = auth.user.utc_offset if auth.user else None

    try:
        t = utc_offset.split()[1]
    except (AttributeError, IndexError):
        # No offset, or not in the expected format
        return timetuple

    if len(t) != 5 or t[0] not in ("+", "-"):
        # Previously this left the result undefined
        return timetuple

    try:
        tdelta = timedelta(hours=int(t[1:3]), minutes=int(t[3:5]))
    except ValueError:
        return timetuple

    # Local time is ahead of UTC for positive offsets
    if t[0] == "+":
        return timetuple - tdelta
    return timetuple + tdelta
4294
4295
def __ocrIt(self,
            image,
            form_uuid,
            resourcename,
            linenum,
            content_type="textbox",
            **kwargs):
    """
        Put Tesseract to work, actual OCRing will be done here

        In either case the image gets stored in ocr_field_crops.

        @param image: the cropped image of the component
        @param form_uuid: the uuid of the form, used in temp file names
        @param resourcename: the resource name, used in temp file names
        @param linenum: the line number, used in temp file names
        @param content_type: "optionbox" or "textbox"
        @param kwargs: resource_table and field_name, plus field_value
                       (for option boxes) or field_seq (for text boxes),
                       all of which get stored along with the image

        @return: for option boxes, True if the mean of the image is
                 below 96, otherwise None; for text boxes, the text
                 which Tesseract has recognised, with line breaks
                 replaced by blanks
    """

    import uuid

    db = current.db
    croptable = db["ocr_field_crops"]

    resource_table = kwargs.get("resource_table")
    field_name = kwargs.get("field_name")

    # Common stem of all temporary files of this call
    basename = "%s_%s_%s_%s" % (uuid.uuid1(),
                                form_uuid,
                                resourcename,
                                linenum)

    ocr_temp_dir = os.path.join(self.r.folder, "uploads", "ocr_temp")
    try:
        os.mkdir(ocr_temp_dir)
    except OSError:
        # Most likely the directory exists already
        pass

    def store_crop(**extra):
        """
            Store the image in ocr_field_crops, via a temporary PNG
            file (which is read in binary mode and always removed)
        """
        imgfilename = "%s.png" % basename
        imgpath = os.path.join(ocr_temp_dir, imgfilename)
        image.save(imgpath)
        try:
            with open(imgpath, "rb") as imgfile:
                croptable.insert(image_set_uuid=self.set_uuid,
                                 resource_table=resource_table,
                                 field_name=field_name,
                                 image_file=croptable["image_file"].store(imgfile,
                                                                          imgfilename),
                                 **extra)
        finally:
            os.remove(imgpath)

    if content_type == "optionbox":
        store_crop(value=kwargs.get("field_value"))

        # The option counts as selected if the image is dark enough
        stat = ImageStat.Stat(image)
        return True if stat.mean[0] < 96 else None

    elif content_type == "textbox":
        inputpath = os.path.join(ocr_temp_dir, "%s.tif" % basename)
        outputbase = os.path.join(ocr_temp_dir, "%s_text" % basename)
        image.save(inputpath)

        success = subprocess.call(["tesseract", inputpath, outputbase])
        if success != 0:
            self.r.error(501, ERROR.TESSERACT_ERROR)

        # Tesseract appends .txt to the output name
        outputpath = "%s.txt" % outputbase
        with open(outputpath) as outputfile:
            outputtext = outputfile.read()
        os.remove(outputpath)

        store_crop(sequence=kwargs.get("field_seq"))
        os.remove(inputpath)

        # Remove the temp directory, including any left-overs
        try:
            os.rmdir(ocr_temp_dir)
        except OSError:
            import shutil
            shutil.rmtree(ocr_temp_dir)

        return outputtext.replace("\n", " ")
4389
4390
4392 """
4393 Converts the image into binary based on a threshold. here it is 180
4394 """
4395
4396 image = ImageOps.grayscale(image)
4397 image.convert("L")
4398
4399 width, height = image.size
4400
4401 for x in xrange(width):
4402 for y in xrange(height):
4403 if image.getpixel((x,y)) < 180 :
4404 image.putpixel((x,y), 0)
4405 else:
4406 image.putpixel((x,y), 255)
4407 return image
4408
4409
def __findRegions(self, im):
    """
        Find the connected regions of black pixels in an image

        -----------------------------------------------------------
        Raster Scanning Algorithm for Connected Component Analysis:
        -----------------------------------------------------------

        On the first pass:
        ==================
        1. Iterate through the pixels column by column
        2. If the pixel is not background (i.e. its value is 0)
            1. Look at the labels of the two neighbours which have
               been visited before (same row in the previous column,
               previous row in the same column)
            2. If neither is labelled, give the pixel a new label
            3. Otherwise give it the smallest neighbouring label
            4. If both neighbours are labelled, record that their
               labels belong to the same region

        On the second pass:
        ===================
        1. Iterate through the pixels once more
        2. Add every labelled pixel to the region of the smallest
           label which is equivalent to its own label

        ( source: http://en.wikipedia.org/wiki/Connected_Component_Labeling )

        @param im: the PIL image; after conversion to mode "L" only
                   pixels with the exact value 0 are foreground

        @return: list of region objects (self.__Region), one for
                 every connected component
    """

    width, height = im.size
    im = im.convert("L")

    # Provisional label of every pixel, 0 = background
    pixel_region = [[0] * height for _ in range(width)]

    # Disjoint-set forest over the labels: parent[label] is the label
    # itself for the representative of a class, otherwise a smaller
    # label of the same class. Following only the smallest recorded
    # equivalence (as the previous implementation did) is not
    # transitive and could split one component into several regions.
    parent = {}

    def find(label):
        """ Return the representative (smallest) label of a class """
        root = label
        while parent[root] != root:
            root = parent[root]
        # Path compression, keeps later lookups short
        while parent[label] != root:
            following = parent[label]
            parent[label] = root
            label = following
        return root

    n_regions = 0

    # First pass: provisional labels and equivalences
    for x in range(width):
        column = pixel_region[x]
        previous_column = pixel_region[x - 1] if x > 0 else None
        for y in range(height):

            if im.getpixel((x, y)) != 0:
                continue

            region_n = previous_column[y] if previous_column else 0
            region_w = column[y - 1] if y > 0 else 0

            if region_n and region_w:
                root_n = find(region_n)
                root_w = find(region_w)
                if root_n != root_w:
                    # The smaller representative survives the merge
                    parent[max(root_n, root_w)] = min(root_n, root_w)
                label = min(region_n, region_w)
            elif region_n or region_w:
                label = region_n or region_w
            else:
                n_regions += 1
                label = n_regions
                parent[label] = label

            column[y] = label

    # Second pass: collect the pixels per representative label
    regions = {}
    for x in range(width):
        column = pixel_region[x]
        for y in range(height):
            label = column[y]
            if not label:
                continue
            root = find(label)
            if root in regions:
                regions[root].add(x, y)
            else:
                regions[root] = self.__Region(x, y)

    return list(regions.values())
4486
4487
def __getOrientation(self, markers):
    """
        Calculate the rotation of the sheet from two of its markers

        NOTE(review): the "def" line of this method is missing from
        the reviewed listing; its name is reconstructed from the
        docstring - confirm against the callers.

        @param markers: sequence of (x, y) marker positions, only
                        markers[0] and markers[2] are used

        @return: the angle in DEGREES (not radians), i.e. the negated
                 arc tangent of (x2 - x1) / (y2 - y1); -90.0 if both
                 markers have the same y coordinate
    """

    x1, y1 = markers[0]
    x2, y2 = markers[2]

    try:
        slope = float(x2 - x1) / float(y2 - y1)
    except ZeroDivisionError:
        # Both markers at the same y: treat the slope as infinite
        slope = float("inf")

    return -math.degrees(math.atan(slope))
4500
4501
def __scaleFactor(self, markers):
    """
        Calculate the scale of the scanned sheet along both axes

        NOTE(review): the "def" line of this method is missing from
        the reviewed listing; its name is reconstructed from the
        docstring - confirm against the callers.

        @param markers: sequence of (x, y) marker positions, only
                        markers[0], markers[2] and markers[6] are used

        @return: dict with the keys
                 "x": distance markers[6]..markers[2] divided by 536
                 "y": distance markers[0]..markers[2] divided by 782
    """

    # Reference distances between the markers
    # NOTE(review): presumably the page size in points (596 x 842)
    # minus 60 for the margins - confirm
    reference_width = 596 - 60
    reference_height = 842 - 60

    vertical = self.__distance([markers[0], markers[2]])
    horizontal = self.__distance([markers[6], markers[2]])

    return {"x": horizontal / reference_width,
            "y": vertical / reference_height,
            }
4516
4517
def __distance(self, li):
    """
        Calculate the euclidean distance between two points

        @param li: the two points in the form [(x1, y1), (x2, y2)];
                   every coordinate is converted with int() before
                   the calculation

        @return: the distance as float
    """

    start, end = li[0], li[1]

    delta_x = int(end[0]) - int(start[0])
    delta_y = int(end[1]) - int(start[1])

    return math.hypot(delta_x, delta_y)
4524
4525
def __getMarkers(self, image):
    """
        Get the positions of the markers on the OCR image

        NOTE(review): the "def" line of this method is missing from
        the reviewed listing; its name is reconstructed from the
        docstring - confirm against the callers.

        A region counts as marker if it consists of more than 320
        pixels and its aspect ratio is between 0.67 and 1.5.

        @param image: the image, passed on to self.__findRegions

        @return: list of at most seven (x, y) centroids: the
                 centroids are sorted (by x, then y), after that the
                 first three and the fifth to seventh are each
                 re-ordered by y, the fourth stays in between
                 NOTE(review): this looks like a 3 + 1 + 3 marker
                 layout - confirm against the form template
    """

    def y_coordinate(point):
        """ Sort key: the y coordinate of a point """
        return point[1]

    centers = []
    for region in self.__findRegions(image):
        if region.area <= 320:
            continue
        try:
            ratio = region.aspectratio()
        except ZeroDivisionError:
            # A region without vertical extent (e.g. a line which is
            # one pixel thin) cannot be a marker - skip it rather
            # than aborting the whole scan
            continue
        if 0.67 < ratio < 1.5:
            centers.append(region.centroid())

    centers.sort()

    markers = sorted(centers[0:3], key=y_coordinate)
    markers.extend(centers[3:4])
    markers.extend(sorted(centers[4:7], key=y_coordinate))

    return markers
4553
4554
class __Region(object):
    """
        A connected region of pixels, described by its pixels, its
        bounding box and the number of its pixels (area)

        NOTE(review): the "class" line and most "def" lines are
        missing from the reviewed listing; the headers are
        reconstructed from the call sites (self.__Region(x, y),
        centroid(), aspectratio()) and the docstrings, the method
        name "box" is an assumption - confirm against the callers.
    """

    def __init__(self, x, y):
        """ Initialize the region with its first pixel """
        self._pixels = [(x, y)]
        self._min_x = x
        self._max_x = x
        self._min_y = y
        self._max_y = y
        self.area = 1

    def add(self, x, y):
        """ Add a pixel to the region and extend the bounding box """
        self._pixels.append((x, y))
        self.area += 1
        self._min_x = min(self._min_x, x)
        self._max_x = max(self._max_x, x)
        self._min_y = min(self._min_y, y)
        self._max_y = max(self._max_y, y)

    def centroid(self):
        """
            Returns the centroid of the bounding box as (x, y)

            Floor division keeps the result integral for integer
            pixel coordinates, independent of the division semantics
            of the interpreter
        """
        return ((self._min_x + self._max_x) // 2,
                (self._min_y + self._max_y) // 2)

    def box(self):
        """ Returns the bounding box as [(min_x, min_y), (max_x, max_y)] """
        return [(self._min_x, self._min_y), (self._max_x, self._max_y)]

    def aspectratio(self):
        """
            Calculate the aspect ratio of the bounding box

            @return: (max_x - min_x) / (max_y - min_y) as float,
                     float("inf") if the region has no vertical
                     extent (instead of raising ZeroDivisionError)
        """
        width = self._max_x - self._min_x
        length = self._max_y - self._min_y
        if length == 0:
            return float("inf")
        return float(width) / float(length)
4595
4596
4597