Context Navigation

source: npl/mailserver/dspam/dspam-3.10.2/src/libdspam_objects.h @ 1a19ecd

perl-5.22

Last change on this file since 1a19ecd was c5c522c, checked in by Edwin Eefting <edwin@datux.nl>, 9 years ago
initial commit, transferred from cleaned syn3 svn tree
Property mode set to `100644`
File size: 11.7 KB

Line
1	/* $Id: libdspam_objects.h,v 1.27 2011/07/11 21:29:57 sbajic Exp $ */
2
3	/*
4	DSPAM
5	COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6
7	This program is free software: you can redistribute it and/or modify
8	it under the terms of the GNU Affero General Public License as
9	published by the Free Software Foundation, either version 3 of the
10	License, or (at your option) any later version.
11
12	This program is distributed in the hope that it will be useful,
13	but WITHOUT ANY WARRANTY; without even the implied warranty of
14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	GNU Affero General Public License for more details.
16
17	You should have received a copy of the GNU Affero General Public License
18	along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20	*/
21
22	#ifndef _LIBDSPAM_OBJECTS_H
23	# define _LIBDSPAM_OBJECTS_H
24
25	#ifdef HAVE_CONFIG_H
26	#include <auto-config.h>
27	#endif
28
29	#include <time.h>
30	#include "config.h"
31	#include "config_shared.h"
32	#include "decode.h"
33
34	#if ((defined(__sun__) && defined(__svr4__)) || (defined(__sun) && defined(__SUNPRO_C))) && !defined(u_int32_t) && !defined(__BIT_TYPES_DEFINED__)
35	#define __BIT_TYPES_DEFINED__
36	typedef unsigned long long u_int64_t;
37	typedef unsigned int u_int32_t;
38	typedef unsigned short u_int16_t;
39	typedef unsigned char u_int8_t;
40	#endif
41
42	#ifdef _WIN32
43	typedef unsigned int u_int32_t; /* local definition of u_int32_t when _WIN32 is defined */
44	typedef u_int32_t uid_t; /* uid_t is an alias of u_int32_t under _WIN32 */
45	#endif
46
47	extern void *_drv_handle; /* Handle to storage driver library */
48
49	/*
50	* struct dspam_factor - A single determining factor
51	*
52	* An element containing a determining factor in the dominant calculation of
53	* a message. An array of these are returned to the calling application to
54	* explain libdspam's final classification decision.
55	*/
56
57	struct dspam_factor {
58	char *token_name; /* name of the token that acted as a determining factor */
59	float value; /* value attached to the factor; NOTE(review): presumably the token's probability -- confirm */
60	};
61
62	/*
63	* struct _ds_spam_totals - User spam totals
64	*
65	* Spam totals loaded into the user's filter context upon a call to
66	* dspam_init(). This structure represents the user's cumulative statistics.
67	*
68	* spam_learned, innocent_learned
69	* The total number of messages trained on.
70	*
71	* spam_misclassified, innocent_misclassified
72	* The total number of messages that were misclassified by DSPAM, and
73	* submitted for retraining.
74	*
75	* spam_classified, innocent_classified
76	* The total number of messages that were classified by DSPAM, but not
77	* learned. Used exclusively with Train-on-Error mode.
78	*
79	* spam_corpusfed, innocent_corpusfed
80	* The total number of messages supplied by the end-user for training.
81	*
82	* NOTE: The ordering of the variables in the structure must remain
83	* consistent to ensure backward-compatibility with some storage
84	* drivers (such as the Berkeley DB drivers)
85	*/
86
87	struct _ds_spam_totals
88	{
89	long spam_learned; /* total spam messages trained on */
90	long innocent_learned; /* total innocent messages trained on */
91	long spam_misclassified; /* spam misclassified by DSPAM and submitted for retraining */
92	long innocent_misclassified; /* innocent mail misclassified by DSPAM and submitted for retraining */
93	long spam_corpusfed; /* spam supplied by the end-user for training */
94	long innocent_corpusfed; /* innocent mail supplied by the end-user for training */
95	long spam_classified; /* spam classified but not learned (Train-on-Error mode only) */
96	long innocent_classified; /* innocent mail classified but not learned (Train-on-Error mode only) */
97	}; /* field order must not change: some storage drivers depend on it (see note above) */
98
99	/*
100	* struct _ds_spam_stat - Statistics for a single token:
101	*
102	* probability
103	* The calculated probability of the token based on the active pvalue
104	* algorithm (selected at configure-time).
105	*
106	* spam_hits, innocent_hits
107	* The total number of times the token has appeared in each class of
108	* message. If Train-on-Error or Train-until-Mature training modes are
109	* employed, these values will not necessarily be updated for every
110	* message.
111	*
112	* status
113	* TST_DISK Value was loaded from the storage interface
114	* TST_DIRTY Statistic is dirty (not written to disk since last modified)
115	*/
116
117	typedef struct _ds_spam_stat
118	{
119	double probability; /* token probability computed by the active pvalue algorithm */
120	long spam_hits; /* number of times the token has appeared in spam */
121	long innocent_hits; /* number of times the token has appeared in innocent mail */
122	char status; /* TST_* status bits (TST_DISK, TST_DIRTY) */
123	unsigned long offset; /* NOTE(review): not described in the header comment; presumably a storage-driver offset -- confirm */
124	} *ds_spam_stat_t; /* ds_spam_stat_t is a pointer to struct _ds_spam_stat, not the struct itself */
125
126	/*
127	* struct _ds_spam_signature - A historical classification signature
128	*
129	* A binary representation of the original training instance. The spam
130	* signature contains all the metadata used in the original decision
131	* about the message, so that a 1:1 retraining can take place if the
132	* message is submitted for retraining (e.g. was misclassified). The
133	* signature contains a series of _ds_signature_token structures, which
134	* house the original set of tokens used and their frequency counts in
135	* the message. A spam signature is a temporary piece of data that is
136	* usually purged from disk after a short period of time.
137	*/
138
139	struct _ds_spam_signature
140	{
141	void *data; /* binary signature blob (a series of struct _ds_signature_token entries) */
142	unsigned long length; /* length of data; NOTE(review): presumably in bytes -- confirm */
143	};
144
145	/*
146	* struct _ds_signature_token - An entry in the classification signature
147	*
148	* A signature token is a single entry in the binary _ds_spam_signature
149	* data blob, representing a single data point from the original
150	* training instance.
151	*
152	* token
153	* The checksum of the original token in the message
154	*
155	* frequency
156	* The token's frequency in the original message
157	*/
158
159	struct _ds_signature_token
160	{
161	unsigned long long token; /* checksum of the original token in the message */
162	unsigned char frequency; /* the token's frequency in the original message */
163	};
164
165	/*
166	* struct _ds_config - libdspam attributes configuration
167	*
168	* Each classification context may have an attributes configuration
169	* which is read by various components of libdspam. This structure
170	* contains an array of attributes and the size of the array.
171	*/
172
173	struct _ds_config
174	{
175	config_t attributes; /* array of configuration attributes */
176	long size; /* size of the attributes array */
177	};
178
179	/*
180	* DSPAM_CTX - The DSPAM Classification Context
181	*
182	* A classification context is attached directly to a filter instance
183	* and supplies the entire context for the filter instance to operate
184	* under. This includes the user and group, operational flags,
185	* training mode, and the message being operated on. The filter
186	* instance also sets specific output variables within the context
187	* such as the result of a classification, confidence level, and
188	* etcetera.
189	*
190	* username, group (input)
191	* The current username and group that is being operated on.
192	*
193	* totals (output)
194	* The set of statistics loaded when dspam_init() is called.
195	*
196	* signature (input, output)
197	* The signature represents a DSPAM signature, and can be supplied
198	* as an input variable for retraining (e.g. in the event of a
199	* misclassification) or used as an output variable to store a
200	* signature generated by the filter instance during normal
201	* classification.
202	*
203	* message (input)
204	* The message being operated on, post-actualization. This can be
205	* left NULL, and libdspam will automatically actualize the message
206	*
207	* probability (output)
208	* The probability of the resulting operation. This is generally a
209	* floating point number between 0 and 1, 1 being the highest
210	* probability of high order classification.
211	*
212	* result (output)
213	* The final result of the requested operation. This is generally
214	* either DSR_ISSPAM, DSR_ISINNOCENT, or DSR_WHITELISTED.
215	*
216	* confidence (output)
217	* The confidence that the filter has in its returned result.
218	* NOTE: Confidence is not always supported, and may be zero.
219	*
220	* operating_mode (input)
221	* Sets the operating mode of the filter instance. This can be one
222	* of the following:
223	*
224	* DSM_PROCESS Classify and learn the supplied message using
225	* whatever training mode is specified
226	*
227	* DSM_CLASSIFY Classify the supplied message only; do not
228	* learn or update any counters.
229	*
230	* DSM_TOOLS Identifies that the calling function is from
231	* a utility, and no operation will be requested.
232	*
233	* training_mode (input)
234	* The training mode sets the type of training the filter instance
235	* should apply to the process. This can be one of:
236	*
237	* DST_TEFT Train-on-Everything
238	* Trains every single message processed
239	*
240	* DST_TOE Train-on-Error
241	* Trains only on a misclassification or
242	* corpus-fed message.
243	*
244	* DST_TUM Train-until-Mature
245	* Trains individual tokens based on the
246	* maturity of the user's dictionary
247	*
248	* DST_NOTRAIN No Training
249	* Process the message but do not perform
250	* any training.
251	* training_buffer (input)
252	* Sets the amount of training-loop buffering. This number is a
253	* range from 0-10 and changes the amount of token sedation used
254	* during the training loop. The higher the number, the more token
255	* statistics are watered down during initial training to prevent
256	* false positives. Setting this value to zero results in no
257	* sedation being performed.
258	*
259	* flags (input)
260	* Applies different fine-tuning behavior to the context:
261	*
262	* DSF_NOISE Apply Bayesian Noise Reduction logic
263	* DSF_SIGNATURE Signature is provided/requested
264	* DSF_WHITELIST Use automatic whitelisting logic
265	* DSF_MERGED Merge user/group data in memory
266	* DSF_UNLEARN Unlearn the message
267	* DSF_BIAS Assign processor bias to unknown tokens
268	*
269	* tokenizer (input)
270	* Specifies which tokenizer to use
271	*
272	* DSZ_WORD Use WORD (uniGram) tokenizer
273	* DSZ_CHAIN Use CHAIN (biGram) tokenizer
274	* DSZ_SBPH Use SBPH (Sparse Binary Polynomial Hashing) tokenizer
275	* DSZ_OSB Use OSB (Orthogonal Sparse biGram) tokenizer
276	*
277	* algorithms (input)
278	* Optional API to override the default algorithms. This value is set
279	* with the default compiled values whenever dspam_create() is called.
280	*
281	* DSA_GRAHAM Graham-Bayesian
282	* DSA_BURTON Burton-Bayesian
283	* DSA_ROBINSON Robinson's Geometric Mean Test
284	* DSA_CHI_SQUARE Fisher-Robinson's Chi-Square
285	* DSA_NAIVE Naive-Bayesian
286	*
287	* P-Value Computations:
288	*
289	* DSP_ROBINSON Robinson's Technique
290	* DSP_GRAHAM Graham's Technique
291	* DSP_MARKOV Markov Weighted Technique
292	*
293	* locked (output)
294	* Identifies that the user's storage is presently locked
295	*/
296
297	typedef struct
298	{
299	struct _ds_spam_totals totals; /* (output) statistics loaded when dspam_init() is called */
300	struct _ds_spam_signature * signature; /* (input, output) signature supplied for retraining or generated during classification */
301	struct _ds_message * message; /* (input) message being operated on; may be left NULL */
302	struct _ds_config * config; /* attributes configuration for this context */
303
304	char *username; /* (input) user being operated on */
305	char *group; /* (input) group being operated on */
306	char *home; /* DSPAM Home */
307	int operating_mode; /* DSM_ */
308	int training_mode; /* DST_ */
309	int training_buffer; /* 0-10 */
310	int wh_threshold; /* Whitelisting Threshold (default 10) */
311	int classification; /* DSR_ */
312	int source; /* DSS_ */
313	int learned; /* Did we actually learn something? */
314	int tokenizer; /* DSZ_ */
315	u_int32_t flags; /* (input) DSF_ processing flags */
316	u_int32_t algorithms; /* (input) DSA_ / DSP_ algorithm selection */
317
318	int result; /* (output) final result of the requested operation */
319	char class[32]; /* NOTE(review): not described in the header comment; presumably the result's class name -- confirm */
320	float probability; /* (output) probability of the resulting operation, generally between 0 and 1 */
321	float confidence; /* (output) confidence in the result; not always supported, may be zero */
322
323	int locked; /* (output) the user's storage is presently locked */
324	void * storage; /* NOTE(review): opaque pointer, presumably owned by the storage driver -- confirm */
325	time_t _process_start; /* internal; NOTE(review): presumably the time processing began -- confirm */
326	int _sig_provided; /* internal; NOTE(review): presumably set when the caller supplied a signature -- confirm */
327
328	struct nt * factors; /* NOTE(review): presumably the list of determining factors (struct dspam_factor) -- confirm */
329
330	} DSPAM_CTX;
331
332	/* Processing Flags (bit values for DSPAM_CTX.flags) */
333
334	#define DSF_SIGNATURE 0x02 /* Signature is provided/requested */
335	#define DSF_BIAS 0x04 /* Assign processor bias to unknown tokens */
336	#define DSF_NOISE 0x08 /* Apply Bayesian Noise Reduction logic */
337	#define DSF_WHITELIST 0x10 /* Use automatic whitelisting logic */
338	#define DSF_MERGED 0x20 /* Merge user/group data in memory */
339	#define DSF_UNLEARN 0x80 /* Unlearn the message */
340
341	/* Tokenizers (values for DSPAM_CTX.tokenizer; sequential values, not bit flags) */
342
343	#define DSZ_WORD 0x01 /* WORD (uniGram) tokenizer */
344	#define DSZ_CHAIN 0x02 /* CHAIN (biGram) tokenizer */
345	#define DSZ_SBPH 0x03 /* SBPH (Sparse Binary Polynomial Hashing) tokenizer */
346	#define DSZ_OSB 0x04 /* OSB (Orthogonal Sparse biGram) tokenizer */
347
348	/* Algorithms (bit values for DSPAM_CTX.algorithms; DSA_ = algorithm, DSP_ = P-value computation) */
349
350	#define DSA_GRAHAM 0x01 /* Graham-Bayesian */
351	#define DSA_BURTON 0x02 /* Burton-Bayesian */
352	#define DSA_ROBINSON 0x04 /* Robinson's Geometric Mean Test */
353	#define DSA_CHI_SQUARE 0x08 /* Fisher-Robinson's Chi-Square */
354	#define DSP_ROBINSON 0x10 /* P-value: Robinson's Technique */
355	#define DSP_GRAHAM 0x20 /* P-value: Graham's Technique */
356	#define DSP_MARKOV 0x40 /* P-value: Markov Weighted Technique */
357	#define DSA_NAIVE 0x80 /* Naive-Bayesian */
358
359	/* Operating Modes (values for DSPAM_CTX.operating_mode) */
360
361	#define DSM_PROCESS 0x00 /* Classify and learn using the configured training mode */
362	#define DSM_TOOLS 0x01 /* Caller is a utility; no operation will be requested */
363	#define DSM_CLASSIFY 0x02 /* Classify only; do not learn or update any counters */
364	#define DSM_NONE 0xFF
365
366	/* Training Modes (values for DSPAM_CTX.training_mode) */
367
368	#define DST_TEFT 0x00 /* Train-on-Everything: trains every message processed */
369	#define DST_TOE 0x01 /* Train-on-Error: trains only on a misclassification or corpus-fed message */
370	#define DST_TUM 0x02 /* Train-until-Mature: trains tokens based on dictionary maturity */
371	#define DST_NOTRAIN 0xFE /* No Training: process the message but do not train */
372
373	/* Classification Results */
374
375	#define DSR_ISSPAM 0x01
376	#define DSR_ISINNOCENT 0x02
377	#define DSR_NONE 0xFF
378
379	/* Classification Sources */
380
381	#define DSS_ERROR 0x00 /* Retraining an error */
382	#define DSS_CORPUS 0x01 /* Training a message from corpus */
383	#define DSS_INOCULATION 0x02 /* Message is an inoculation */
384	#define DSS_NONE 0xFF /* Standard inbound processing */
385
386	/* Statuses for token-status bit (struct _ds_spam_stat.status) */
387	#define TST_DISK 0x01 /* Value was loaded from the storage interface */
388	#define TST_DIRTY 0x02 /* Statistic is dirty (not written to disk since last modified) */
389
390	/* Token Types */
391	#define DTT_DEFAULT 0x00
392	#define DTT_BNR 0x01
393
394	#define DSP_UNCALCULATED -1
395
396	#define BURTON_WINDOW_SIZE 27
397
398	#endif /* _LIBDSPAM_OBJECTS_H */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: