具体思路:
1->敏感词库,可从数据库读取,也可以从文件加载.
2->将敏感词转化为gbk编码,因为gbk严格按照英文字符一个字节,汉字两个字节的格式编码,便于按字符切分文本.
3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list,因为考虑到同一个整数对应多个关键字.
4->检测一段文字内容时,也事先将其转化为gbk,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.
代码.h
1 #ifndef SENSITIVE_WORDS_CHECKER_
2 #define SENSITIVE_WORDS_CHECKER_
3 #include <stdint.h>
4 #include <stdio.h>
5 #include <memory.h>
6 #include <map>
7 #include <vector>
8
// Size limits used by the sensitive-word checker.
enum {
    enmMaxWordLength = 32,               // maximum size in bytes of one sensitive word (size of SensitiveWord::szWord)
    enmMaxWordsFileLength = 1024 * 128,  // maximum size of the word-list file: 128 KB
    enmMaxContentLength = 1024,          // maximum size of the content checked in a single call
};
14
15 struct SensitiveWord
16 {
17 char szWord[enmMaxWordLength];
18 SensitiveWord()
19 {
20 memset(szWord, 0, enmMaxWordLength);
21 }
22 };
23
24 typedef std::vector<SensitiveWord*> WordList;
25 typedef std::map<uint32_t, WordList*> WordMap;
26
27 class SensitiveWordsChecker
28 {
29 public:
30 SensitiveWordsChecker() :arrSensitiveWord(NULL), nSensitiveWordCnt(0){}
31 ~SensitiveWordsChecker(){ delete[] arrSensitiveWord; }
32 public:
33 void LoadWordsFromUTF8File(const char *file_name);
34 void LoadWordsFromGBKFile(const char *file_name);
35 protected:
36 int32_t WriteToFile(const char buf[], const int32_t buf_size, const char *file_name);
37 void DumpWordMap();
38 void GenTestData();
39 void Test();
40 void StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...);
41 private:
42 int32_t LoadFile(char buf[], const uint32_t buf_size, const char *file_name);
43 int32_t CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen);
44 int32_t UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
45 int32_t GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
46 uint32_t GetWordsCount(char buf[],const uint32_t buf_size,char separator);
47 char *StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list);
48 int32_t GetWords(char gbk_buf[], const uint32_t buf_size, char separator);
49 void BuildWordMap();
50 uint32_t GetFirstCharFromGBK(char gbk_buf[]);
51 uint32_t GetFirstCharFromTUF8(char utf8_buf[]);
52 uint32_t GetFirstChar(char buf[]);
53 // 返回 0 表示in_utf8_buf里面没有敏感词
54 // 返回 1 表示in_utf8_buf里面含有关键词,并将关键词替换为*输出到out_utf8_buf
55 int32_t CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[]);
56 const SensitiveWord* FindSensitiveWord(uint32_t code,const char *pos);
57 private:
58 SensitiveWord *arrSensitiveWord;
59 uint32_t nSensitiveWordCnt;
60 WordMap mapWords;
61 };
62
63 #endif
View Code
.cpp
1 #include "SenditiveWordsChecker.h"
2 #include "stdio.h"
3 #include "string.h"
4 #include "iconv.h"
5 #include <stdarg.h>
6 #include <new>
7
8 void SensitiveWordsChecker::LoadWordsFromUTF8File(const char *file_name)
9 {
10 char utf8_buf[enmMaxWordsFileLength] , gbk_buf[enmMaxWordsFileLength];
11 LoadFile(utf8_buf, enmMaxWordsFileLength, file_name);
12 UTF8_To_GBK(utf8_buf, strlen(utf8_buf), gbk_buf, enmMaxWordsFileLength);
13 GetWords(gbk_buf, enmMaxWordsFileLength, ',');
14 }
15
16 void SensitiveWordsChecker::LoadWordsFromGBKFile(const char *file_name)
17 {
18 char gbk_buf[enmMaxWordsFileLength];
19 LoadFile(gbk_buf, enmMaxWordsFileLength, file_name);
20 GetWords(gbk_buf, enmMaxWordsFileLength,',');
21 }
22
23 int32_t SensitiveWordsChecker::LoadFile(char buf[], const uint32_t buf_size, const char *file_name)
24 {
25 FILE * pFile;
26 size_t lSize = 0, result = 0;
27 fopen_s(&pFile, file_name, "rb");
28 if (pFile == NULL) { fputs("File errorn", stderr); return -1; }
29 // obtain file size:
30 fseek(pFile, 0, SEEK_END);
31 lSize = ftell(pFile);
32 rewind(pFile);
33 if (lSize >= buf_size){ fputs("file too largen", stderr); return -1; }
34 result = fread(buf, 1, lSize, pFile);
35 if (result != lSize) { fputs("Reading errorn", stderr); return -1; }
36 buf[lSize] = ' ';
37 return fclose(pFile);
38 }
39
40 int32_t SensitiveWordsChecker::CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen)
41 {
42 iconv_t cd;
43 char **pin = &inbuf;
44 char **pout = &outbuf;
45
46 cd = iconv_open(to_charset, from_charset);
47 if (cd == 0)
48 return -1;
49 memset(outbuf, 0, outlen);
50 if (iconv(cd, pin, &inlen, pout, &outlen) == -1)
51 return -1;
52 iconv_close(cd);
53 *pout = ' ';
54 return 0;
55 }
56
57 int32_t SensitiveWordsChecker::UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
58 {
59 return CodeConvert("utf-8", "gbk", inbuf, inlen, outbuf, outlen);
60 }
61
62 int32_t SensitiveWordsChecker::GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
63 {
64 return CodeConvert("gbk", "utf-8", inbuf, inlen, outbuf, outlen);
65 }
66
67 uint32_t SensitiveWordsChecker::GetWordsCount(char buf[], const uint32_t buf_size, char separator)
68 {
69 const char *p = buf - 1;
70 uint32_t i = 0;
71 while ((p = strchr(p + 1, separator)) != NULL)
72 {
73 ++i;
74 }
75 return i;
76 }
77
78 int32_t SensitiveWordsChecker::WriteToFile(const char buf[], const int32_t buf_size, const char *file_name)
79 {
80 FILE * pFile;
81 size_t result;
82 fopen_s(&pFile, file_name, "wb");
83 if (pFile == NULL) { fputs("File errorn", stderr); return -1; }
84 result = fwrite(buf, 1, buf_size, pFile);
85 if (result != buf_size) { fputs("Writing errorn", stderr); return -1; }
86 return fclose(pFile);
87 }
88
89 int32_t SensitiveWordsChecker::GetWords(char gbk_buf[], const uint32_t buf_size, char separator)
90 {
91 char buf[enmMaxWordsFileLength];
92 StrcpyExcludeChar(buf, enmMaxWordsFileLength, gbk_buf, "n"); //排除换行符
93 uint32_t nWordsCount = GetWordsCount(buf, buf_size,',');
94 printf("words_count=%dn", nWordsCount);
95 arrSensitiveWord = new SensitiveWord[nWordsCount];
96 if (arrSensitiveWord == NULL){return -1;}
97 nSensitiveWordCnt = 0;
98 const char *p = NULL,*q = buf;
99 while ((p = strchr(q, separator)) != NULL)
100 {
101 memcpy(arrSensitiveWord[nSensitiveWordCnt].szWord, q, p - q);
102 //printf("%sn", arrSensitiveWord[nSensitiveWordCnt].szWord);
103 q = p + 1;
104 ++nSensitiveWordCnt;
105 }
106 BuildWordMap();
107 return 0;
108 }
109
110 char * SensitiveWordsChecker::StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list)
111 {
112 uint32_t i = 0, j = 0, flag = 0;
113 const char *p = NULL;
114 if (dst == NULL && src == NULL)return NULL;
115 if (dst == src)return dst;
116 for (; j < dst_len && src[i] != ' '; ++i)
117 {
118 flag = 0;
119 p = exclude_list;
120 while (p && *p != ' ')
121 {
122 if (*p == src[i]){ flag = 1; break; }
123 p++;
124 }
125 if (flag == 0)dst[j++] = src[i];
126 }
127 dst[j] = ' ';
128 return dst;
129 }
130
131 uint32_t SensitiveWordsChecker::GetFirstCharFromGBK(char gbk_buf[])
132 {
133 int32_t code = 0;
134 int32_t len = strlen(gbk_buf);
135 if (len == 0)return 0;
136 if (gbk_buf[0] >= 0 || len == 1)
137 {
138 //printf("%cn", gbk_buf[0]);
139 return uint32_t(gbk_buf[0]); //ASCII 字符
140 }
141 else
142 {
143 short high = (short)gbk_buf[0] + 256;
144 short low = (short)gbk_buf[1] + 256;
145 code = high * 256 + low;
146 char cstr[3];
147 cstr[0] = gbk_buf[0]; // GBK严格按照两个字节表示一个中文字符
148 cstr[1] = gbk_buf[1];
149 cstr[2] = 0;
150 //printf("%s %xn", cstr, code);
151 return code;
152 }
153 }
154
155 uint32_t SensitiveWordsChecker::GetFirstCharFromTUF8(char utf8_buf[])
156 {
157 uint32_t code = 0;
158 int32_t len = strlen(utf8_buf);
159 if (len == 0)return 0;
160 if (utf8_buf[0] >= 0 || len == 1)
161 {
162 printf("%cn", utf8_buf[0]);
163 return int32_t(utf8_buf[0]); //ASCII 字符
164 }
165 else
166 {
167 short high = (short)utf8_buf[0];
168 short mid = (short)utf8_buf[1];
169 short low = (short)utf8_buf[2];
170 code = high * 256 * 256 + mid * 256 + low;
171 char cstr[4];
172 cstr[0] = utf8_buf[0]; // UTF8大多数情况下三个字节表示一个中文字符
173 cstr[1] = utf8_buf[1];
174 cstr[2] = utf8_buf[2];
175 cstr[3] = 0;
176 printf("%sn", cstr);
177 return code;
178 }
179 }
180
181 uint32_t SensitiveWordsChecker::GetFirstChar(char buf[])
182 {
183 uint32_t code = 0;
184 int32_t len = strlen(buf);
185 if (len == 0)return 0;
186 return (uint32_t)buf[0];
187 }
188
189 void SensitiveWordsChecker::BuildWordMap()
190 {
191 WordList *wordList = NULL;
192 for (uint32_t i = 0; i < nSensitiveWordCnt; ++i)
193 {
194 uint32_t code = GetFirstCharFromGBK(arrSensitiveWord[i].szWord);
195 WordMap::iterator it = mapWords.find(code);
196 if (it == mapWords.end())
197 {
198 wordList = new WordList();
199 mapWords[code] = wordList;
200 }
201 else
202 {
203 wordList = it->second;
204 }
205 wordList->push_back(&arrSensitiveWord[i]);
206 }
207 DumpWordMap();
208 GenTestData();
209 Test();
210 }
211
212 void SensitiveWordsChecker::DumpWordMap()
213 {
214 uint32_t word_cnt = 0,i = 0;
215 WordMap::const_iterator it = mapWords.begin();
216 for (; it != mapWords.end(); ++it)
217 {
218 //printf("%u : %un", i++, it->second->size());
219 word_cnt += it->second->size();
220 }
221 printf("word_cnt = %un", word_cnt);
222 }
223
224 int32_t SensitiveWordsChecker::CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[])
225 {
226 // 先把被检测字符串转换为GBK编码
227 char gbk_buf[enmMaxContentLength],out_gbk_buf[enmMaxContentLength];
228 UTF8_To_GBK(in_utf8_buf, strlen(in_utf8_buf), gbk_buf, enmMaxContentLength);
229 // 提取GBK字串里面的每一个字符,去map里面查找以该字符为首的关键词列表
230 int32_t gbk_buf_len = strlen(gbk_buf);
231 uint32_t code = 0, flag = 0, out_gbk_buf_len = 0;
232 char c = 0, cstr[3] = { 0 };
233 for (int32_t i = 0; i < gbk_buf_len;)
234 {
235 flag = 0;
236 if (gbk_buf[i] >= 0 || i == gbk_buf_len - 1)
237 {
238 c = gbk_buf[i];
239 //printf("%cn", c); //ASCII字符
240 code = (uint32_t)c;
241 flag = 1;
242 out_gbk_buf[out_gbk_buf_len] = c;
243 }
244 else
245 {
246 flag = 2;
247 short high = (short)gbk_buf[i] + 256;
248 short low = (short)gbk_buf[i + 1] + 256;
249 code = high * 256 + low;
250
251 cstr[0] = gbk_buf[i];
252 cstr[1] = gbk_buf[i + 1];
253 cstr[2] = 0;
254
255 out_gbk_buf[out_gbk_buf_len] = cstr[0];
256 out_gbk_buf[out_gbk_buf_len + 1] = cstr[1];
257 //printf("%sn", cstr);
258 }
259 // 检查敏感词
260 const SensitiveWord *sensitiveWord = FindSensitiveWord(code, &gbk_buf[i]);
261 int32_t word_len = 0;
262 if (NULL != sensitiveWord)
263 {
264 flag = 0;
265 //printf("%sn", sensitiveWord->szWord);
266 word_len = strlen(sensitiveWord->szWord);
267 memset(&out_gbk_buf[out_gbk_buf_len],'*', word_len);
268 }
269 int32_t step = word_len + flag;
270 i += step;
271 out_gbk_buf_len += step;
272 }
273 out_gbk_buf[out_gbk_buf_len] = ' ';
274 //printf("out_gbk_buf = %sn", out_gbk_buf);
275 GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxContentLength);
276 return 0;
277 }
278
279 const SensitiveWord* SensitiveWordsChecker::FindSensitiveWord(uint32_t code, const char *pos)
280 {
281 int32_t word_len = 0;
282 WordMap::const_iterator it = mapWords.find(code);
283 if (it == mapWords.end()){ return NULL; }
284 WordList *wordList = it->second;
285 for (uint32_t i = 0; i < wordList->size(); i++)
286 {
287 const SensitiveWord *sensitiveWord = (*wordList)[i];
288 word_len = strlen(sensitiveWord->szWord);
289 // 如果内容一样,就说明是敏感词
290 if (memcmp(sensitiveWord->szWord, pos, word_len) == 0)
291 {
292 return sensitiveWord;
293 }
294 }
295 return NULL;
296 }
297
298 void SensitiveWordsChecker::GenTestData()
299 {
300 char in_gbk_buf[enmMaxWordsFileLength], out_gbk_buf[enmMaxWordsFileLength];
301 LoadFile(in_gbk_buf, enmMaxWordsFileLength, "poem.txt");
302 int32_t len = strlen(in_gbk_buf);
303 uint32_t n = 0;
304 for (int32_t i = 0; i < len && n < enmMaxWordsFileLength;++i)
305 {
306 if (i % 4 == 0 && short(in_gbk_buf[i]) > 0)
307 {
308 int32_t nRandIndex = rand() % nSensitiveWordCnt;
309 SensitiveWord sensitiveWord = arrSensitiveWord[nRandIndex];
310 int32_t word_len = strlen(sensitiveWord.szWord);
311 for (int32_t j = 0; j < word_len && n < enmMaxWordsFileLength; ++j)
312 {
313 out_gbk_buf[n++] = sensitiveWord.szWord[j];
314 }
315 }
316 out_gbk_buf[n++] = in_gbk_buf[i];
317 }
318 out_gbk_buf[n] = ' ';
319 char out_utf8_buf[enmMaxWordsFileLength];
320 GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxWordsFileLength);
321 WriteToFile(out_utf8_buf, strlen(out_utf8_buf), "test_data.txt");
322 }
323
324 void SensitiveWordsChecker::Test()
325 {
326 const int32_t max_line_len = 1024;
327 char utf8_buf[enmMaxWordsFileLength];
328 char out_utf8_buf[enmMaxWordsFileLength];
329 LoadFile(utf8_buf, enmMaxWordsFileLength, "test_data.txt");
330 const char *p = NULL, *q = utf8_buf;
331 uint32_t offset = 0;
332 while ((p = strchr(q, 'n')) != NULL)
333 {
334 char in_uft8_line[max_line_len] = { 0 };
335 char out_uft8_line[max_line_len] = { 0 };
336 char out_gbk_line[max_line_len] = { 0 };
337 memcpy(in_uft8_line, q, p - q);
338 UTF8_To_GBK(in_uft8_line, strlen(in_uft8_line), out_gbk_line, max_line_len);
339 printf("%sn", out_gbk_line);
340 CheckSensitiveWord(out_uft8_line, in_uft8_line);
341 q = p + 1;
342 char gbk[enmMaxContentLength];
343 UTF8_To_GBK(out_uft8_line, strlen(out_uft8_line), gbk, enmMaxContentLength);
344 printf("%sn", gbk);
345 StrAppend(out_utf8_buf, enmMaxWordsFileLength, offset, "%s", out_uft8_line);
346 }
347 WriteToFile(out_utf8_buf, offset, "test_data_ret.txt");
348 }
349
350 void SensitiveWordsChecker::StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...)
351 {
352 va_list argptr;
353 va_start(argptr, fmt);
354 if (offset < bufLen)
355 {
356 offset += vsprintf_s(buf + offset, bufLen - offset, fmt, argptr);
357 }
358 va_end(argptr);
359 }
View Code
测试效果:
完整VS2013工程:http://download.csdn.net/detail/tangxin19930330/9558997
原文链接: https://www.cnblogs.com/tangxin-blog/p/5615579.html
欢迎关注
微信关注下方公众号,第一时间获取干货硬货;公众号内回复【pdf】免费获取数百本计算机经典书籍
原创文章受到原创版权保护。转载请注明出处:https://www.ccppcoding.com/archives/235733
非原创文章文中已经注明原地址,如有侵权,联系删除
关注公众号【高性能架构探索】,第一时间获取最新文章
转载文章受原作者版权保护。转载请注明原作者出处!