Bootstrap

Python源码剖析(四)字符串对象

bilibili视频讲解:https://space.bilibili.com/431392724
b站用户名:平凡的久月

1. PyBytesObject

变长对象(数据长度在定义时是不知道的,只能在创建时才能确定)

不可变对象(对象创建后其内容不能被原地修改;所谓"修改"实际上是创建了一个新对象,因此内存地址会发生改变)

1.1 定义

// Include/bytesobject.h
#ifndef Py_LIMITED_API
/* Layout of a bytes object: the variable-size object header, a cached hash,
 * and the byte data stored inline at the end of the same struct. */
typedef struct {
    PyObject_VAR_HEAD       /* expands to `PyVarObject ob_base;` */
    Py_hash_t ob_shash;     /* cached hash, -1 until it has been computed */
    char ob_sval[1];        /* start of the inline data (declared with length 1) */

    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the string or -1 if not computed yet.
     */
} PyBytesObject;
#endif

// Python 3 no longer uses PyBytesObject as the underlying implementation of the str type.
// PyObject_VAR_HEAD declares the `ob_base` member of type PyVarObject.
#define PyObject_VAR_HEAD  PyVarObject ob_base;

/* Header of a variable-size object: a PyObject header followed by an item count. */
typedef struct {
    PyObject ob_base;   /* embedded fixed-size object header */
    Py_ssize_t ob_size; /* Number of items in variable part */
} PyVarObject;

/* Basic object header; PyVarObject embeds it as its first member. */
typedef struct _object {
    _PyObject_HEAD_EXTRA            /* NOTE(review): presumably extra debug-only fields that are empty in normal builds -- confirm */
    Py_ssize_t ob_refcnt;           /* reference count */
    struct _typeobject *ob_type;    /* pointer to the object's type object */
} PyObject;

// Equivalent flattened layout once PyObject_VAR_HEAD is expanded
// (assuming _PyObject_HEAD_EXTRA expands to nothing).
// The fields are listed in their actual memory order: the object header
// (ob_refcnt, ob_type), then ob_size from PyVarObject, then the
// bytes-specific fields. ob_sval has to be the last member because the
// byte data continues past the end of the struct.
typedef struct {
    Py_ssize_t ob_refcnt;          // reference count (from PyObject)
    struct _typeobject *ob_type;   // pointer to the type object (from PyObject)
    Py_ssize_t ob_size;            // number of items in the variable part (from PyVarObject)
    Py_hash_t ob_shash;            // cached hash of the object (initially -1) so it is not recomputed; its role is explained in the dict chapter
    // The data maintained by a PyBytesObject must end with '\0'.
    char ob_sval[1];               // inline char array (not a pointer): ob_size+1 bytes including the trailing '\0'
} PyBytesObject;
/* Type object describing bytes; only part of the slot table is shown here. */
PyTypeObject PyBytes_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "bytes",
    PyBytesObject_SIZE,  // tp_basicsize: size of the fixed part (this slot is not ob_size; ob_size is the 0 given to PyVarObject_HEAD_INIT)
    sizeof(char),        // tp_itemsize: one byte per item
    bytes_dealloc,                      /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    (reprfunc)bytes_repr,                       /* tp_repr */
    // bytes fills in all three protocol tables: number, sequence and mapping
    &bytes_as_number,                           /* tp_as_number */
    &bytes_as_sequence,                         /* tp_as_sequence */
    &bytes_as_mapping,                          /* tp_as_mapping */
    (hashfunc)bytes_hash,                       /* tp_hash */
    // ...... (slots in between are omitted in this excerpt)
    0,                                          /* tp_init */
    0,                                          /* tp_alloc */
    bytes_new,                                  /* tp_new */
    PyObject_Del,                               /* tp_free */
};

1.2 创建PyBytesObject

Python提供了多种路径从C中原生的字符串创建PyBytesObject对象

  • PyBytes_FromString

    /* Build a bytes object from a NUL-terminated C string.
     *
     * str must not be NULL (checked with assert). Returns the object as a
     * PyObject*, NULL with OverflowError set when the string is too long,
     * or the result of PyErr_NoMemory() when the allocation fails.
     * The empty string and one-byte strings are looked up in, and stored
     * into, the nullstring / characters[] caches.
     */
    PyObject *
    PyBytes_FromString(const char *str)
    {
        size_t size;
        PyBytesObject *op;
    
        assert(str != NULL);
        size = strlen(str);
        // Reject strings whose length plus the object header would exceed PY_SSIZE_T_MAX
        if (size > PY_SSIZE_T_MAX - PyBytesObject_SIZE) {
            PyErr_SetString(PyExc_OverflowError,
                "byte string is too long");
            return NULL;
        }
        
        // Empty string: reuse the single shared nullstring object if it already exists
        if (size == 0 && (op = nullstring) != NULL) {
    #ifdef COUNT_ALLOCS
            null_strings++;
    #endif
            Py_INCREF(op);
            return (PyObject *)op;
        }
        
        // Single character: reuse the cached object from the characters[] pool if present
        if (size == 1 && (op = characters[*str & UCHAR_MAX]) != NULL) {
    #ifdef COUNT_ALLOCS
            one_strings++;
    #endif
            Py_INCREF(op);
            return (PyObject *)op;
        }
    
        // Allocate a new PyBytesObject and initialize it
        /* Inline PyObject_NewVar */
        op = (PyBytesObject *)PyObject_MALLOC(PyBytesObject_SIZE + size);
        if (op == NULL)
            return PyErr_NoMemory();
        (void)PyObject_INIT_VAR(op, &PyBytes_Type, size);
        /* hash not computed yet */
        op->ob_shash = -1;
        /* size+1 so that the terminating '\0' is copied as well */
        memcpy(op->ob_sval, str, size+1);
        /* share short strings */
        if (size == 0) {
            nullstring = op;
            /* extra reference for the pointer kept in nullstring */
            Py_INCREF(op);
        } else if (size == 1) {
            characters[*str & UCHAR_MAX] = op;
            /* extra reference for the pointer kept in characters[] */
            Py_INCREF(op);
        }
        return (PyObject *) op;
    }
    

在这里插入图片描述

python3中没有ob_sstate这个变量!!!

  • PyBytes_FromStringAndSize

    /* Build a bytes object of `size` bytes, optionally copying them from str.
     *
     * A negative size sets SystemError and returns NULL. When str is NULL
     * the object obtained from _PyBytes_FromSize is returned without any
     * data being copied into it. One-byte strings are looked up in, and
     * stored into, the characters[] cache.
     */
    PyObject *
    PyBytes_FromStringAndSize(const char *str, Py_ssize_t size)
    {
        PyBytesObject *op;
        if (size < 0) {
            PyErr_SetString(PyExc_SystemError,
                "Negative size passed to PyBytes_FromStringAndSize");
            return NULL;
        }
        
        // Single character: reuse the cached object if there is one
        if (size == 1 && str != NULL &&
            (op = characters[*str & UCHAR_MAX]) != NULL)
        {
    #ifdef COUNT_ALLOCS
            one_strings++;
    #endif
            Py_INCREF(op);
            return (PyObject *)op;
        }
    
        // Create a new PyBytesObject and initialize it
        op = (PyBytesObject *)_PyBytes_FromSize(size, 0);
        if (op == NULL)
            return NULL;
        if (str == NULL)
            return (PyObject *) op;
    
        /* Copies exactly size bytes.
         * NOTE(review): the trailing '\0' is presumably written by _PyBytes_FromSize -- confirm */
        memcpy(op->ob_sval, str, size);
        /* share short strings */
        if (size == 1) {
            characters[*str & UCHAR_MAX] = op;
            /* extra reference for the pointer kept in characters[] */
            Py_INCREF(op);
        }
        return (PyObject *) op;
    }
    

1.3 intern机制

1.3.1 Python2中

Python2中通过PyString_InternInPlace实现intern机制

检查两项内容:

(1)是否是PyStringObject(Python2中的字符串对象,对应Python3中的PyBytesObject)?

(2)是否被intern机制处理过(保证只处理一次)

/* Intern the string that *p points to, in place.
 *
 * If an equal string is already stored in the `interned` dict, *p is
 * replaced by that stored object (via Py_SETREF). Otherwise the string is
 * inserted into the dict as both key and value and marked as interned.
 * Exact-type check failures, already-interned strings and dict errors make
 * the function return without changing *p; a NULL or non-string argument
 * is a fatal error.
 */
void
PyString_InternInPlace(PyObject **p)
{
    register PyStringObject *s = (PyStringObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyString_Check(s))
        Py_FatalError("PyString_InternInPlace: strings only please!");
    /* If it's a string subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyString_CheckExact(s))
        return;
    /* Already interned: nothing to do, so each string is processed once. */
    if (PyString_CHECK_INTERNED(s))
        return;
    /* Create the interned dict on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* An equal string is already interned: hand that object back through *p. */
    t = PyDict_GetItem(interned, (PyObject *)s);
    if (t) {
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }

    /* Not found: store s in the dict as both key and value. */
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        return;
    }
    /* The two references in interned are not counted by refcnt.
       The string deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
  • intern是什么?PyDictObject对象–interned

  • 创建临时变量a,在intern中寻找是否有一样的对象。

在这里插入图片描述

  • intern中的指针不作为a的有效引用(不然a永远无法销毁)

细节问题:

(1)为什么创建的时候interned的键与值都设置为对象的PyObject指针?

(2)为什么将对象的引用计数减2?

1.3.2 Python3中

Python3中做了修改:内建函数intern被移到了sys模块(即sys.intern),而编译器仍会默认对部分字符串自动执行intern操作

# In Python 3, intern() lives in the sys module. The former six.moves import
# was redundant: the sys import rebound the same name right after it.
from sys import intern

text = "shanghai"  # not named `str`, so the built-in type is not shadowed
# NOTE(review): this prints the docstring of the interned string object (the
# str type's docstring); use intern.__doc__ to show the docs of intern itself.
print(intern(text).__doc__)
// Modules/pyexpat.c
/* Convert str to a unicode object and intern it in the parser's own
 * `self->intern` dict.
 *
 * Returns NULL if the conversion fails or if inserting into the dict fails.
 * When the parser has no intern dict, the converted object is returned as is.
 * NOTE(review): this uses a per-parser dict; it does not appear to be the
 * implementation of sys.intern -- confirm.
 */
static PyObject*
string_intern(xmlparseobject *self, const char* str)
{
    PyObject *result = conv_string_to_unicode(str);
    PyObject *value;
    /* result can be NULL if the unicode conversion failed. */
    if (!result)
        return result;
    if (!self->intern)
        return result;
    value = PyDict_GetItem(self->intern, result);
    if (!value) {
        /* Not seen before: store the new object as both key and value. */
        if (PyDict_SetItem(self->intern, result, result) == 0)
            return result;
        else
            return NULL;
    }
    /* Already present: return the stored object and drop the fresh copy. */
    Py_INCREF(value);
    Py_DECREF(result);
    return value;
}

1.4 字符串缓冲池

一个字节的字符对应的对象缓冲池

/* Cache of one-byte bytes objects, indexed by byte value (0..UCHAR_MAX). */
static PyBytesObject *characters[UCHAR_MAX + 1];
/* The single shared empty bytes object. */
static PyBytesObject *nullstring;

实现过程

(1)创建PyBytesObject对象

(2)进行intern操作

(3)缓存进缓冲池

1.5 与效率相关的问题

背景:实现100个字符串的拼接

实现方法:+

问题:创建N-1个对象,进行N-1次内存的申请与释放

根本原因:不可变对象

解决方法:对存储在list的一组对象进行连接操作(join)

join会先遍历list中的所有对象,统计它们维护的字符串的总长度,然后一次性申请足够的内存,最后把各个字符串依次拷贝到这块内存空间中,因此整个过程只需要申请一次内存。

# Identity (`is`) checks on integers: b is bound to the very object a refers to,
# while c and d are bound to two separate literals with the same value.
a = 345
b = a
c = 456
d = 456
print(a is b)
print(a is c)
print(c is d)


# Identity checks on strings with equal and with different contents.
e = "abc"
f = "abc"
g = "abd"
print(e is f)
print(e is g)

# Output recorded by the author, in print order:
# True   (a is b)
# False  (a is c)
# True   (c is d)  NOTE(review): presumably equal constants in one script share an object -- implementation detail, confirm
# True   (e is f)
# False  (e is g)
;