# python-llsd/llsd/base.py at 74daf22a3b024b0df680734070512b3b121bfb12 · secondlife/python-llsd · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
import abc
import base64
import binascii
import datetime
import io
import os
import re
import sys
import types
import uuid

try:
    # If the future package is installed, then we support it.  Any clients in
    # python 2 using its str builtin replacement will actually be using instances
    # of newstr, so we need to properly detect that as a string type
    # for details see the docs: http://python-future.org/str_object.html
    from future.types.newstr import newstr
except ImportError:
    # otherwise we pass over it in silence
    newstr = str

# True when running under a Python 2 interpreter.
PY2 = sys.version_info[0] == 2

# MIME type strings, one per LLSD serialization (XML, binary, notation).
XML_MIME_TYPE = 'application/llsd+xml'
BINARY_MIME_TYPE = 'application/llsd+binary'
NOTATION_MIME_TYPE = 'application/llsd+notation'

# Header byte strings, one per LLSD serialization.
XML_HEADER = b'<? llsd/xml ?>'
BINARY_HEADER = b'<? llsd/binary ?>'
NOTATION_HEADER = b'<? llsd/notation ?>'

# A byte string holding every byte value 0..255 in ascending order. Python 2
# has to go through bytearray because bytes(range(256)) there would produce
# the repr of the list instead.
ALL_CHARS = str(bytearray(range(256))) if PY2 else bytes(range(256))

# Nesting limits. MAX_PARSE_DEPTH is enforced by _to_python() in this module.
# NOTE(review): MAX_FORMAT_DEPTH is not referenced in this module; presumably
# it is consumed by the formatter subclasses -- confirm.
MAX_FORMAT_DEPTH = 200
MAX_PARSE_DEPTH = 200

class _LLSD:
    __metaclass__ = abc.ABCMeta

    def __init__(self, thing=None):
        self.thing = thing


undef = _LLSD(None)


# Python 2 has no separate bytes type (bytes is str), so a dedicated 'binary'
# subclass is what lets a Python 2 caller mark data as binary rather than
# text. Python 3 needs no wrapper: bytes already is that distinct type.
if not PY2:
    binary = bytes
else:
    class binary(str):
        "Wrapper type marking llsd.binary data under Python 2."


class uri(str):
    """String subclass used to tag a value as llsd.uri data."""


class LLSDParseError(Exception):
    """Raised when LLSD input cannot be parsed."""


class LLSDSerializationError(TypeError):
    """Raised when a Python object cannot be serialized as LLSD."""


# Tuple of every type this module treats as a string: (str, unicode) on
# Python 2, just (str,) on Python 3 (plus future's newstr when it is a
# distinct type). Duplicates collapse through the set. It is deliberately a
# tuple even when it has a single member, because some consumers test
# (type(somevar) in StringTypes) rather than using isinstance().
StringTypes = tuple({type(''), type(u''), newstr})

# Probe for Python 2's separate 'long' type. When it exists, integers come in
# two flavors; otherwise plain int covers everything.
try:
    LongType = long
except NameError:
    LongType = int
    IntTypes = int
else:
    IntTypes = (int, LongType)

# The text (non-bytes) string type: unicode on Python 2, str on Python 3.
# Taking the type of a u'' literal yields exactly that on either version.
UnicodeType = type(u'')

try:
    # Feature probe: does this interpreter support bytes % args?
    b'%s' % (b'yes',)
except TypeError:
    # There's a range of Python 3 versions, up through Python 3.4, for which
    # bytes interpolation (bytes value with % operator) does not work. This
    # hack can be removed once we no longer care about Python 3.4 -- in other
    # words, once we're beyond jessie everywhere.
    class B(object):
        """
        Stand-in for a bytes format string on interpreters that lack bytes
        interpolation. Instead of writing:
        b'format string' % stuff
        write:
        B('format string') % stuff
        The format and any bytes arguments are decoded to text, interpolated
        as text, and the result is encoded back to bytes.
        (We considered naming this class b, but that would be too confusing.)
        """
        def __init__(self, fmt):
            # Decode once up front so __mod__ doesn't have to do it on every
            # interpolation.
            try:
                decode = fmt.decode
            except AttributeError:
                # caller passed a string literal rather than a bytes literal
                self.strfmt = fmt
            else:
                self.strfmt = decode('utf-8')

        def __mod__(self, args):
            # __mod__() is engaged for (self % args)
            # Treat a lone argument as a one-element tuple.
            if not isinstance(args, tuple):
                args = (args,)
            # Anything that can be decoded (i.e. bytes) becomes text; every
            # other argument is passed to the interpolation untouched.
            converted = tuple(
                arg.decode('utf-8') if hasattr(arg, 'decode') else arg
                for arg in args
            )
            return (self.strfmt % converted).encode('utf-8')
else:
    # bytes interpolation Just Works
    def B(fmt):
        """
        Return 'fmt' as bytes, so that (B('fmt') % args) uses the native
        bytes interpolation.
        """
        try:
            # In the usual case, caller wrote B('fmt') rather than b'fmt'.
            encode = fmt.encode
        except AttributeError:
            # Caller wrote B(b'fmt')? Already bytes, hand it back.
            return fmt
        return encode('utf-8')


def is_integer(o):
    """
    Portably report whether 'o' is an instance of the integer type(s)
    in IntTypes.
    """
    return isinstance(o, IntTypes)


def is_unicode(o):
    """
    Portably report whether 'o' is a text string (an instance of
    UnicodeType) as opposed to bytes.
    """
    return isinstance(o, UnicodeType)


def is_string(o):
    """
    Portably report whether 'o' is string-like, i.e. an instance of any
    type listed in StringTypes.
    """
    return isinstance(o, StringTypes)


# Pattern for the date text shared by the xml and notation serializations:
#     YYYY-MM-DDTHH:MM:SS[.FFFFFF]Z
# (in notation the text appears as d"YYYY-MM-DDTHH:MM:SS.FFFFFFZ").
# The fractional-seconds group is optional and accepts any number of digits;
# when present, the captured text includes the leading '.'.
# The pattern has no end anchor, and _parse_datestr() applies it with
# re.match(), so text following the 'Z' is not rejected.
_date_regex = re.compile(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T"
                        r"(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})"
                        r"(?P<second_float>(\.\d+)?)Z")


def _str_to_bytes(s):
    """
    Return 's' encoded as UTF-8 when it is a text string; any other value
    is returned unchanged.
    """
    return s.encode('utf-8') if is_unicode(s) else s


def _format_datestr(v):
    """
    Formats a datetime or date object into the string format shared by
    xml and notation serializations.

    :param v: A datetime.date or datetime.datetime. A plain date is treated
        as midnight of that day. A timezone-aware datetime is converted to
        UTC first, since the emitted text always carries a 'Z' suffix.
    :returns: The ISO 8601 text followed by 'Z', passed through
        _str_to_bytes().
    :raises LLSDSerializationError: if 'v' is neither a date nor a datetime.
    """
    # datetime.datetime is a subclass of datetime.date, so this one check
    # rejects everything that is neither.
    if not isinstance(v, datetime.date):
        # Wrap 'v' in a tuple: a bare tuple argument would otherwise be
        # unpacked by the % operator and raise a plain TypeError.
        raise LLSDSerializationError(
            "invalid date string %s passed to date formatter" % (v,))

    if not isinstance(v, datetime.datetime):
        v = datetime.datetime.combine(v, datetime.time(0))
    else:
        offset = v.utcoffset()
        if offset is not None:
            # Aware datetime: isoformat() would append '+HH:MM', and with the
            # 'Z' added below that yields text _date_regex cannot parse.
            # Shift to UTC and drop the tzinfo instead.
            v = (v - offset).replace(tzinfo=None)

    return _str_to_bytes(v.isoformat() + 'Z')


def _parse_datestr(datestr):
    """
    Parses a datetime object from the string format shared by
    xml and notation serializations.

    :param datestr: Text of the form YYYY-MM-DDTHH:MM:SS[.F...]Z. The empty
        string is accepted and means the epoch.
    :returns: A naive datetime.datetime. Fractional seconds beyond
        microsecond precision are truncated.
    :raises LLSDParseError: if 'datestr' does not match _date_regex.
    :raises ValueError: (from datetime.datetime) if a matched field is out
        of range, e.g. month 13.
    """
    if datestr == "":
        return datetime.datetime(1970, 1, 1)

    match = _date_regex.match(datestr)
    if not match:
        raise LLSDParseError("invalid date string '%s'." % datestr)

    year = int(match.group('year'))
    month = int(match.group('month'))
    day = int(match.group('day'))
    hour = int(match.group('hour'))
    minute = int(match.group('minute'))
    second = int(match.group('second'))

    usec = 0
    fraction = match.group('second_float')
    if fraction:
        # 'fraction' is '.' followed by one or more digits. Keep the first
        # six digits, right-padded with zeros, as the microsecond count.
        # Doing this on the digits rather than via float() * 1e6 avoids
        # binary rounding error: e.g. '.1251' used to come out as 125099.
        usec = int(fraction[1:7].ljust(6, '0'))
    return datetime.datetime(year, month, day, hour, minute, second, usec)


def _bool_to_python(node, depth=0):
    "Convert boolean node to a python object."
    val = node.text or ''
    try:
        # string value, accept 'true' or 'True' or whatever
        return (val.lower() in ('true', '1', '1.0'))
    except AttributeError:
       # not a string (no lower() method), use normal Python rules
       return bool(val)


def _int_to_python(node, depth=0):
    "Convert integer node to a python object."
    val = node.text or ''
    if not val.strip():
        return 0
    return int(val)


def _real_to_python(node, depth=0):
    "Convert floating point node to a python object."
    val = node.text or ''
    if not val.strip():
        return 0.0
    return float(val)


def _uuid_to_python(node, depth=0):
    "Convert uuid node to a python object."
    if node.text:
        return uuid.UUID(hex=node.text)
    return uuid.UUID(int=0)


def _str_to_python(node, depth=0):
    "Convert string node to a python object."
    return node.text or ''


def _bin_to_python(node, depth=0):
    """
    Convert binary node to a python object.

    The node's 'encoding' attribute selects the decoder and defaults to
    'base64' when missing or empty.

    :returns: The decoded data as 'binary', or None when the encoding is
        not one of 'base16', 'base64' or 'base85'.
    :raises LLSDParseError: for 'base85' (unsupported) and for text that
        cannot be decoded with the selected encoding.
    """
    base = node.get('encoding') or 'base64'
    try:
        if base == 'base16':
            # parse base16 encoded data
            return binary(base64.b16decode(node.text or ''))
        elif base == 'base64':
            # parse base64 encoded data
            return binary(base64.b64decode(node.text or ''))
        elif base == 'base85':
            # Raise, don't return: a returned exception instance would be
            # handed to the caller as if it were the parsed value.
            raise LLSDParseError("Parser doesn't support base85 encoding")
    except binascii.Error as exc:
        # convert exception class so it's more catchable
        raise LLSDParseError("Encoded binary data: " + str(exc))
    except TypeError as exc:
        # convert exception class so it's more catchable
        raise LLSDParseError("Bad binary data: " + str(exc))
    # Unrecognized encoding name: the original fell off the end here, so
    # keep yielding None.
    return None


def _date_to_python(node, depth=0):
    """
    Convert date node to a python object.

    A node without text is parsed as the epoch, 1970-01-01T00:00:00Z.
    """
    return _parse_datestr(node.text or "1970-01-01T00:00:00Z")


def _uri_to_python(node, depth=0):
    """
    Convert uri node to a python object.

    A node without text yields an empty uri.
    """
    return uri(node.text or '')


def _map_to_python(node, depth=0):
    "Convert map node to a python object."
    result = {}
    for index in range(len(node))[::2]:
        if node[index].text is None:
            result[''] = _to_python(node[index+1], depth+1)
        else:
            result[node[index].text] = _to_python(node[index+1], depth+1)
    return result


def _array_to_python(node, depth=0):
    "Convert array node to a python object."
    return [_to_python(child, depth+1) for child in node]


# Dispatch table used by _to_python(): node tag -> converter. Every converter
# is called as handler(node, depth).
NODE_HANDLERS = {
    'undef': lambda x, y: None,
    'boolean': _bool_to_python,
    'integer': _int_to_python,
    'real': _real_to_python,
    'uuid': _uuid_to_python,
    'string': _str_to_python,
    'binary': _bin_to_python,
    'date': _date_to_python,
    'uri': _uri_to_python,
    'map': _map_to_python,
    'array': _array_to_python,
}


def _to_python(node, depth=0):
    """
    Convert node to a python object.

    Dispatches on node.tag through NODE_HANDLERS. Raises LLSDParseError when
    'depth' exceeds MAX_PARSE_DEPTH; a tag with no handler raises KeyError.
    """
    if depth <= MAX_PARSE_DEPTH:
        handler = NODE_HANDLERS[node.tag]
        return handler(node, depth)

    raise LLSDParseError("Cannot parse depth of more than %d" % MAX_PARSE_DEPTH)


class LLSDBaseFormatter(object):
    """
    Common base for the LLSD formatters.

    It is not usable on its own: __init__() looks up per-type methods with
    canonical names (_UNDEF, _BOOLEAN, ... _LLSD) and write() delegates to
    _write(), all of which a subclass must supply. What this base class
    contributes is self.type_map, built from those subclass methods, plus
    the format()/write() entry points.
    """
    __slots__ = ['stream', 'type_map']

    def __init__(self):
        "Build the dispatch table mapping Python types to formatter methods."
        self.stream = None
        # Pairs are listed in the same order as the table has always been
        # built. Several keys can be the same type on a given Python version
        # (e.g. int and LongType on Python 3); they map to the same method.
        self.type_map = dict((
            (type(None), self._UNDEF),
            (undef, self._UNDEF),
            (bool, self._BOOLEAN),
            (int, self._INTEGER),
            (LongType, self._INTEGER),
            (float, self._REAL),
            (uuid.UUID, self._UUID),
            (binary, self._BINARY),
            (str, self._STRING),
            (UnicodeType, self._STRING),
            (newstr, self._STRING),
            (uri, self._URI),
            (datetime.datetime, self._DATE),
            (datetime.date, self._DATE),
            (list, self._ARRAY),
            (tuple, self._ARRAY),
            (types.GeneratorType, self._ARRAY),
            (dict, self._MAP),
            (_LLSD, self._LLSD),
        ))

    def format(self, something):
        """
        Serialize a python object and return the result as bytes.

        This is write() directed at an in-memory buffer.

        :param something: A python object (typically a dict) to be serialized.
        :returns: A serialized bytes object.
        """
        buf = io.BytesIO()
        self.write(buf, something)
        return buf.getvalue()

    def write(self, stream, something):
        """
        Serialize a python object to the passed binary 'stream' according to
        subclass formatting.

        'stream' is held in self.stream only for the duration of the call;
        it is cleared again even if the subclass _write() raises.

        :param stream: A binary file-like object to which to serialize 'something'.
        :param something: A python object (typically a dict) to be serialized.
        :returns: Whatever the subclass _write() returns.
        """
        self.stream = stream
        try:
            return self._write(something)
        finally:
            self.stream = None


# Byte values tested in the string-decoding loop, and the size by which the
# decode scratch buffer is allocated and grown.
_X_ORD = ord(b'x')
_BACKSLASH_ORD = ord(b'\\')
_DECODE_BUFF_ALLOC_SIZE = 1024


class LLSDBaseParser(object):
    """
    Utility methods useful for parser subclasses.

    The parser reads from a seekable binary stream held in self._stream;
    see _reset() for the accepted input types.
    """
    __slots__ = ['_stream', '_decode_buff']

    def __init__(self, something=b''):
        """
        :param something: Input to parse; see _reset() for accepted types.
        :raises LLSDParseError: if 'something' is not an accepted input.
        """
        self._reset(something)
        # Scratch space for decoding delimited strings
        self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE)

    def _reset(self, something):
        """
        Point this parser at new input.

        :param something: Another LLSDBaseParser (its stream is shared), a
            bytes object, or a seekable io.IOBase stream.
        :raises LLSDParseError: for a non-seekable stream or any other type.
        """
        if isinstance(something, LLSDBaseParser):
            # When passed an existing LLSDBaseParser (subclass) instance, just
            # borrow its existing _stream.
            self._stream = something._stream
        elif isinstance(something, bytes):
            # Wrap an incoming bytes string into a stream. If the passed bytes
            # string is so large that the overhead of copying it into a
            # BytesIO is significant, advise caller to pass a stream instead.
            self._stream = io.BytesIO(something)
        elif isinstance(something, io.IOBase):
            # 'something' is a proper IO stream - must be seekable for parsing
            if something.seekable():
                self._stream = something
            else:
                raise LLSDParseError(
                    "Cannot parse LLSD from non-seekable stream."
                )
        else:
            # Invalid input type - raise a clear error
            # This catches MagicMock and other non-stream objects that might
            # have read/seek attributes but aren't actual IO streams
            raise LLSDParseError(
                "Cannot parse LLSD from {0}. "
                "Expected bytes or a seekable io.IOBase object.".format(
                    type(something).__name__
                )
            )

    def starts_with(self, pattern):
        """
        Like matchseq(), except that starts_with() doesn't consume what it
        matches: it always resets our input stream to its previous position.
        """
        oldpos = self._stream.tell()
        try:
            return self.matchseq(pattern)
        finally:
            self._stream.seek(oldpos)

    def matchseq(self, pattern):
        """
        Match bytes object 'pattern' after skipping arbitrary leading
        whitespace. After successfully matching 'pattern', skip trailing
        whitespace as well.

        'pattern' is NOT a regular expression, but a bytes string in which
        each space character matches zero or more whitespace characters in the
        stream. Non-space characters are matched case-insensitively.

        If 'pattern' matches, return True and leave our input stream advanced
        past the last byte examined.

        If 'pattern' does not match, return False and reset our input stream
        to its previous read position.
        """
        oldpos = self._stream.tell()
        for chunk in pattern.split():
            # skip leading space before this chunk
            c = self._next_nonblank()
            # if we hit EOF, no match
            if not c:
                self._stream.seek(oldpos)
                return False
            # not EOF: try to match non-empty chunk,
            # not forgetting that 'c' is a lookahead byte
            # (split() never produces a zero-length chunk)
            maybe = c + self._stream.read(len(chunk)-1)
            if maybe.lower() != chunk.lower():
                # mismatch, reset
                self._stream.seek(oldpos)
                return False
            # so far so good, back for next chunk

        # here we've matched every chunk, with the read pointer just at the end of
        # the last matched chunk -- skip trailing space
        if self._next_nonblank():
            # back up one character, i.e. put back the nonblank
            self._stream.seek(-1, io.SEEK_CUR)
        # success!
        return True

    def remainder(self):
        """
        Return the stream object representing the parse input (from the last
        _reset() call), whose read position is set past scanned input.
        """
        return self._stream

    def _next_nonblank(self):
        """
        Read and return the next non-whitespace byte, or an empty bytes
        object at EOF.
        """
        # we directly call read() rather than getc() because our caller is
        # prepared to handle empty string, meaning EOF
        # (YES we want the walrus operator)
        c = self._stream.read(1)
        while c.isspace():
            c = self._stream.read(1)
        return c

    def _getc(self, num=1, full=True):
        """
        Read and return 'num' bytes from the stream.

        :param num: Number of bytes requested.
        :param full: When true, a short read is reported through _error();
            when false, whatever was read (possibly nothing) is returned.
        """
        got = self._stream.read(num)
        if full and len(got) < num:
            self._error("Trying to read past end of stream")
        return got

    def _putback(self, cc):
        """Move the read position back by len(cc) bytes."""
        # if this test fails, it's not a user error, it's a coding error
        assert self._stream.tell() >= len(cc)
        self._stream.seek(-len(cc), io.SEEK_CUR)

    def _error(self, message, offset=0):
        """
        Raise LLSDParseError for 'message', reporting the byte position
        (current position plus 'offset') and the byte found there.

        This method never returns normally.
        """
        oldpos = self._stream.tell()
        # 'offset' is relative to current pos
        self._stream.seek(offset, io.SEEK_CUR)
        raise LLSDParseError("%s at byte %d: %r" %
                             (message, oldpos+offset, self._getc(1, full=False)))

    # map char following escape char to corresponding character
    _escaped = {
        ord(b'a'): ord(b'\a'),
        ord(b'b'): ord(b'\b'),
        ord(b'f'): ord(b'\f'),
        ord(b'n'): ord(b'\n'),
        ord(b'r'): ord(b'\r'),
        ord(b't'): ord(b'\t'),
        ord(b'v'): ord(b'\v'),
    }

    def _parse_string_delim(self, delim):
        """
        Parse a delimited string.

        Reads from the current stream position up to and including the next
        unescaped 'delim' byte, resolving backslash escapes (the letters in
        _escaped, \\xNN hex escapes, and backslash-anything-else meaning that
        same byte).

        :param delim: The single closing delimiter byte.
        :returns: The decoded text, without the delimiter.
        :raises LLSDParseError: on end of input, an invalid hex escape, or
            bytes that are not valid UTF-8.
        """
        insert_idx = 0
        delim_ord = ord(delim)
        # Start from the preallocated working buffer for the decoded string
        # output to avoid allocs in the hot loop.
        decode_buff = self._decode_buff
        # Cache this in locals, otherwise we have to perform a lookup on
        # `self` in the hot loop.
        getc = self._getc
        cc = 0
        while True:
            try:
                cc = ord(getc())

                if cc == _BACKSLASH_ORD:
                    # Backslash, figure out if this is an \xNN hex escape or
                    # something like \t
                    cc = ord(getc())
                    if cc == _X_ORD:
                        # It's a hex escape. char is the value of the two
                        # following hex nybbles. _getc() reports a short
                        # read through _error().
                        hex_bytes = getc(2)
                        try:
                            # int() can parse a `bytes` containing hex,
                            # no explicit `bytes.decode("ascii")` required.
                            cc = int(hex_bytes, 16)
                        except ValueError as e:
                            # One of the hex characters was likely invalid.
                            # Wrap the ValueError so that we can provide a
                            # byte offset in the error.
                            self._error(e, offset=-2)
                        if cc < 0:
                            # int() also accepts a leading sign, so b'-1'
                            # parses to -1. That is not a byte value; report
                            # it here instead of letting the buffer store
                            # below fail with a bare ValueError.
                            self._error("Invalid hex escape", offset=-2)
                    else:
                        # escape char preceding anything other than the chars
                        # in _escaped just results in that same char without
                        # the escape char
                        cc = self._escaped.get(cc, cc)
                elif cc == delim_ord:
                    break
            except IndexError:
                # NOTE(review): with the stream-based _getc() defined in this
                # class, running out of input is reported by _getc() itself,
                # so no IndexError is expected here. Kept as a guard.
                self._error("Trying to read past end of buffer")

            try:
                decode_buff[insert_idx] = cc
            except IndexError:
                # Oops, that overflowed the decoding buffer, make a
                # new expanded buffer containing the existing contents.
                # Only the local name is rebound; self._decode_buff keeps
                # its original size.
                decode_buff = bytearray(decode_buff)
                decode_buff.extend(b"\x00" * _DECODE_BUFF_ALLOC_SIZE)
                decode_buff[insert_idx] = cc

            insert_idx += 1

        try:
            # Slice off only what we used of the working decode buffer
            return decode_buff[:insert_idx].decode('utf-8')
        except UnicodeDecodeError as exc:
            self._error(exc)