2.lua 中string的实现

本文主要是介绍2.lua 中string的实现，对大家解决编程问题具有一定的参考价值，需要的程序猿们随着小编来一起学习吧！

数据结构定义

#define CommonHeader	struct GCObject *next; lu_byte tt; lu_byte marked

typedef struct GCObject {
  CommonHeader;
} GCObject;

typedef union Value {
  struct GCObject *gc;    /* collectable objects */
  void *p;         /* light userdata */
  lua_CFunction f; /* light C functions */
  lua_Integer i;   /* integer numbers */
  lua_Number n;    /* float numbers */
} Value;

typedef struct TValue {
  TValuefields;
} TValue;

//字符串
typedef struct TString {
  CommonHeader;      //gc
  lu_byte extra;     //长字符串是否hash过的标志
  lu_byte shrlen;    //短字符串的长度
  unsigned int hash; //hash值
  union {
    size_t lnglen;   //长字符串的长度
    struct TString *hnext;  //短字符串的next指针
  } u; //在长字符串表示长度,短字符串时next指针
  char contents[1]; //文本内容,具体大小看字符串
} TString;

CommonHeader这个宏是字符串的第一个成员，这么做的原因是与GCObject结构体的内存一致，暂时不理会lua的gc操作。

lua字符串的设计

lua虚拟机有一个全局hash表去存储短字符串(常字符串不存储在hash表)
字符串只读，在lua语言中所有对字符串进行修改都会重新生成字符串

a = "1"
a = "2"

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-la2ECNNb-1623548821251)(./1.png)]

为什么要设计字符串只读?
可以从以下两点去分析

字符串的比较和查找:传统字符串的比较是怎么样的？先比较其长度，如果长度相等进入循环逐个字符去比较。那么设计只读字符串之后呢？所有字符串在内存中独一份，不同字符串的地址是不同的，因此想要判断字符串是否一样只要比较地址是否相同就好了。字符串放入hash表中，可以根据hash值找到对应的hash槽位进行遍历查找。所以这样设计可以提高比较和查找的效率

空间节省:在lua中变量保存相同的字符串指向的都是同一个内存，可以省去创建字符串副本的空间

长短字符串

#define LUA_TSTRING		4
//长短宏定义
#define LUA_VSHRSTR	makevariant(LUA_TSTRING, 0)  //短字符串
#define LUA_VLNGSTR	makevariant(LUA_TSTRING, 1)  //长字符串
//类型判断
#define ttisstring(o)		checktype((o), LUA_TSTRING)
#define ttisshrstring(o)	checktag((o), ctb(LUA_VSHRSTR))
#define ttislngstring(o)	checktag((o), ctb(LUA_VLNGSTR))
//长短界限
#define LUAI_MAXSHORTLEN	40
//获取值
#define getstr(ts)  ((ts)->contents)

lua中字符串类型是LUA_TSTRING ,如果根据这个值又分出了LUA_VSHRSTR和LUA_VLNGSTR，和之前的number类似。

区别长短字符串的界限在于LUAI_MAXSHORTLEN这个宏,大于这个是长字符串，小于等于是短字符串

字符串的创建

存储位置

#define STRCACHE_N		53
#define STRCACHE_M		2

typedef struct stringtable {
  TString **hash; //指针数组
  int nuse;  //hash槽的大小
  int size;  //字符串存储的数量
} stringtable;

typedef struct global_State {
  ...
  stringtable strt;  //字符串hash表
  ...
  TString *strcache[STRCACHE_N][STRCACHE_M];  //字符串缓存
  ...
} global_State;

strt是字符串最终存储的地方
strcache做了一下简单的缓存

初始化hash表

LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
  ...
  
  //清空hash表
  g->strt.size = g->strt.nuse = 0;
  g->strt.hash = NULL;
  
  ...
}

#define MINSTRTABSIZE	128

void luaS_init (lua_State *L) {
  global_State *g = G(L);
  int i, j;
  stringtable *tb = &G(L)->strt;
  //创建大小为MINSTRTABSIZE的hash槽
  tb->hash = luaM_newvector(L, MINSTRTABSIZE, TString*);
  tablerehash(tb->hash, 0, MINSTRTABSIZE);  //初始化hash槽
  tb->size = MINSTRTABSIZE;
  
  //初始化缓存字符串
  g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
  luaC_fix(L, obj2gco(g->memerrmsg));  /* it should never be collected */
  for (i = 0; i < STRCACHE_N; i++)  /* fill cache with valid strings */
    for (j = 0; j < STRCACHE_M; j++)
      g->strcache[i][j] = g->memerrmsg;
}

缓存

#define point2uint(p)	((unsigned int)((size_t)(p) & UINT_MAX))

TString *luaS_new (lua_State *L, const char *str) {
  unsigned int i = point2uint(str) % STRCACHE_N;  //以地址为hash
  int j;
  TString **p = G(L)->strcache[i];		//根据hash值找到对应的槽位
  for (j = 0; j < STRCACHE_M; j++) {    //查找缓存字符串
    if (strcmp(str, getstr(p[j])) == 0) //如果有缓存，说明字符串已经创建过了
      return p[j];
  }
  
  for (j = STRCACHE_M - 1; j > 0; j--)  //移位出一个缓存位置,可能会清空上一次的缓存
    p[j] = p[j - 1];
  //创建新的字符串
  p[0] = luaS_newlstr(L, str, strlen(str));
  return p[0];
}

可以看到代码以地址为hash值做了一个简单的缓存机制。
参数1:主要用于寻找存储位置(hash表)
参数2:创建字符串

创建字符串入口

TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  if (l <= LUAI_MAXSHORTLEN)
    return internshrstr(L, str, l); //短字符串
  else {
  	//长字符串
    TString *ts;
    if (l_unlikely(l >= (MAX_SIZE - sizeof(TString))/sizeof(char)))
      luaM_toobig(L);
    ts = luaS_createlngstrobj(L, l);
    memcpy(getstr(ts), str, l * sizeof(char));
    return ts;
  }
}

参数1:寻找存储位置(hash表)
参数2:字符串
参数3:字符串长度

可以看到通过长度l与LUAI_MAXSHORTLEN去比大小来分辨长短字符串。

短字符串的创建

//hash函数
unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
  unsigned int h = seed ^ cast_uint(l);
  for (; l > 0; l--)
    h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1]));
  return h;
}

static TString *internshrstr (lua_State *L, const char *str, size_t l) {
  TString *ts;
  global_State *g = G(L);
  stringtable *tb = &g->strt;	//hash表
  unsigned int h = luaS_hash(str, l, g->seed);//计算hash值
  TString **list = &tb->hash[lmod(h, tb->size)];//根据字符串的hash值找到对应的槽
  lua_assert(str != NULL);  /* otherwise 'memcmp'/'memcpy' are undefined */
  //查重操作，如果有表示之前创建过这个字符串,直接返回该字符串
  for (ts = *list; ts != NULL; ts = ts->u.hnext) {
    if (l == ts->shrlen && (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
      /* found! */
      if (isdead(g, ts))  /* dead (but not collected yet)? */
        changewhite(ts);  /* resurrect it */
      return ts;
    }
  }
  //字符串的个数和hash表槽的大小，判定是否进行rehash
  if (tb->nuse >= tb->size) {  /* need to grow string table? */
    growstrtab(L, tb);
    list = &tb->hash[lmod(h, tb->size)];  //rehash完当前字符串新的槽位
  }
  ts = createstrobj(L, l, LUA_VSHRSTR, h);
  //字符串赋值
  memcpy(getstr(ts), str, l * sizeof(char));
  ts->shrlen = cast_byte(l);//字符串大小
  //添加链表(头插)
  ts->u.hnext = *list;
  *list = ts;
  tb->nuse++;//字符串个数+1
  return ts;
}

这个是hash表插入的常规操作

算hash值
根据hash值找到槽位
遍历当前槽位去查重，有重复直接返回
最后就是赋值，插入hash表

在lua字符串中解决hash冲突用的是链表

短字符串的rehash

static void tablerehash (TString **vect, int osize, int nsize) {
  int i;
  for (i = osize; i < nsize; i++)  //初始化新的槽位
    vect[i] = NULL;
  for (i = 0; i < osize; i++) {    //遍历老的槽位进行rehash
    TString *p = vect[i];		  //保存当前槽
    vect[i] = NULL;                //清空当前槽
    while (p) {  //遍历当前槽
      TString *hnext = p->u.hnext; //保存下一个节点位置
	  
      unsigned int h = lmod(p->hash, nsize);  //重新计算hash槽位
	  //插入链表(头插)
      p->u.hnext = vect[h];
      vect[h] = p;
	  
      p = hnext;				  //之前保存的下个节点的位置
    }
  }
}

void luaS_resize (lua_State *L, int nsize) {
  stringtable *tb = &G(L)->strt; //hash表
  int osize = tb->size; //oldsize
  TString **newvect;
  if (nsize < osize)  //缩容
    tablerehash(tb->hash, osize, nsize);
  //realloc
  newvect = luaM_reallocvector(L, tb->hash, osize, nsize, TString*);
  if (l_unlikely(newvect == NULL)) {  //realloc 失败
    if (nsize < osize)
      tablerehash(tb->hash, nsize, osize);//如果之前缩容过,恢复之前的hash表
  }
  else {
    //更新hash表
    tb->hash = newvect;
    tb->size = nsize;
	
    if (nsize > osize) //扩容
      tablerehash(newvect, osize, nsize);
  }
}

代码也相对简单,通过oldsize和newsize去判断扩容和缩容，关键函数在于tablerehash，操作也是比较简单常规

遍历hash表
根据hash值重新得到新的槽位,然后插入

长字符串的创建

TString *luaS_createlngstrobj (lua_State *L, size_t l) {
  TString *ts = createstrobj(L, l, LUA_VLNGSTR, G(L)->seed);
  ts->u.lnglen = l;
  return ts;
}

TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  if (l <= LUAI_MAXSHORTLEN)  /* short string? */
    return internshrstr(L, str, l);
  else {
    TString *ts;
    if (l_unlikely(l >= (MAX_SIZE - sizeof(TString))/sizeof(char)))
      luaM_toobig(L);
    ts = luaS_createlngstrobj(L, l);
	//赋值
    memcpy(getstr(ts), str, l * sizeof(char));
    return ts;
  }
}

长字符串的创建没有那么复杂，创建完后赋值就完成创建了

字符串的删除

void luaS_remove (lua_State *L, TString *ts) {
  stringtable *tb = &G(L)->strt;//hash表
  TString **p = &tb->hash[lmod(ts->hash, tb->size)];//该字符串对应的槽位
  while (*p != ts)  //查找ts
    p = &(*p)->u.hnext;
  *p = (*p)->u.hnext;	//让前一个节点的next指针指向下一个节点
  tb->nuse--;
}

长字符串没有删除

遍历对应的槽位去查找要删除的字符串
让前一个节点的next指针指向下一个节点

字符串的比较

//短字符串的比较
#define eqshrstr(a,b)	check_exp((a)->tt == LUA_VSHRSTR, (a) == (b))
//长字符串的比较
int luaS_eqlngstr (TString *a, TString *b) {
  size_t len = a->u.lnglen;
  lua_assert(a->tt == LUA_VLNGSTR && b->tt == LUA_VLNGSTR);
  return (a == b) ||  /* same instance or... */
    ((len == b->u.lnglen) &&  /* equal length and ... */
     (memcmp(getstr(a), getstr(b), len) == 0));  /* equal contents */
}

短字符串的比较直接比较地址
长字符串的比较优先比较地址，然后是常规的字符串比较

参考

<lua设计与实现> codedump

基于lua5.4.3源码

这篇关于2.lua 中string的实现的文章就介绍到这儿，希望我们推荐的文章对大家有所帮助，也希望大家多多支持为之网！

Java教程

2.lua 中string的实现

数据结构定义

lua字符串的设计

长短字符串

字符串的创建

存储位置

初始化hash表

缓存

创建字符串入口

短字符串的创建

短字符串的rehash

长字符串的创建

字符串的删除

字符串的比较

参考

前端开发

后端开发

移动端开发

数据库

服务器运维

人工智能

区块链

游戏开发

网站运营

大数据/云计算

软件工程

软件/开发工具使用

资讯