1 module arff;
2 
3 import std.algorithm;
4 import std.array;
5 import std.conv;
6 import std.csv;
7 import std.exception;
8 import std.file;
9 import std.math;
10 import std.range;
11 import std.stdio;
12 import std.string;
13 import std.uni;
14 
/**
	Enumerates the kinds of attribute that can be described by this module.
*/
enum AttributeType : int
{
    /// An attribute holding a numeric value.
    numeric = 0,

    /// An attribute whose value is one of a fixed list of categories.
    nominal = 1
}
23 
/**
	Describes one attribute: its name, its type and, for nominal attributes,
	the list of categories it may take.
*/
struct Attribute
{
    /**
        Builds a numeric attribute.

        Params:
            name = The name of the attribute.
    */
    this(string name)
    {
        mType = AttributeType.numeric;
        mName = name;
    }

    /**
        Builds a nominal attribute. The category list is copied, so the
        caller's array is not aliased.

        Params:
            name = The name of the attribute.
            categories = The possible values that this attribute can take.
    */
    this(string name, string[] categories)
    {
        mType = AttributeType.nominal;
        mName = name;
        mCategories = categories.dup;
    }

    /**
        Maps a category of a nominal attribute to the number used to store it,
        which is the position of the category in the category list.

        Params:
            category = The category to look up.

        Returns:
            The zero-based index of $(D category), converted to $(D float).

        Throws:
            $(D Exception) if this attribute is not nominal, or if
            $(D category) is not one of its categories.
    */
    float stringToFloat(string category) const
    {
        if(mType != AttributeType.nominal)
        {
            throw new Exception(
                "Cannot perform stringToFloat because '" ~ name ~ "' is not a nominal attribute");
        }

        foreach(position, known; mCategories)
        {
            if(known == category)
            {
                return cast(float)position;
            }
        }

        throw new Exception("Unknown nominal value '" ~ category ~ "' for attribute '" ~ mName ~ "'");
    }

    /// The name of the attribute.
    @property string name() const
    {
        return mName;
    }

    /// Whether the attribute is numeric or nominal.
    @property AttributeType type() const
    {
        return mType;
    }

    /// The categories of a nominal attribute; empty for a numeric attribute.
    @property const(string[]) categories() const
    {
        return mCategories;
    }

private:
    string mName;
    AttributeType mType;
    string[] mCategories;
}
95 
/**
	Holds a relation: its name, its attributes, its instances and the number
	of attributes that are labels.
*/
struct ARFF
{
    /**
        Builds a relation from its parts. The attribute array and every row of
        values are copied rather than aliased.

        Params:
            name = The name of the relation.
            attribs = The attributes, one per column.
            vals = The instances, one row of values per instance.
            lbls = How many attributes are labels.
    */
    this(string name, Attribute[] attribs, float[][] vals, uint lbls)
    {
        mName = name;
        mLabels = lbls;
        mAttributes = attribs.dup;

        foreach(row; vals)
        {
            mValues ~= row.dup;
        }
    }

    /**
        Returns a copy of $(D inst) in which the last $(D labels) entries have
        been replaced by NaN. The argument itself is left untouched.
    */
    float[] obscureLabels(float[] inst)
    {
        auto masked = inst.dup;
        immutable firstLabel = masked.length - mLabels;

        masked[firstLabel .. $] = float.nan;

        return masked;
    }

    /// The name of the relation.
    @property string name() const
    {
        return mName;
    }

    /// The attributes of the relation.
    @property Attribute[] attributes()
    {
        return mAttributes;
    }

    /// The number of attributes that are labels.
    @property uint labels()
    {
        return mLabels;
    }

    /// The number of attributes that are not labels.
    @property uint features()
    {
        immutable total = cast(uint)mAttributes.length;

        return total - mLabels;
    }

    /// The stored instances, one row of values per instance.
    @property float[][] values()
    {
        return mValues;
    }

private:
    string mName;
    Attribute[] mAttributes;
    uint mLabels;
    float[][] mValues;
}
158 
/**
    Removes the keyword $(D kw) from the front of $(D line) if $(D line)
    starts with it. The comparison lower-cases both sides, so it is
    case-insensitive.

    Params:
        line = The range to inspect; advanced past the keyword on a match and
               left unchanged otherwise.
        kw = The keyword to look for.

    Returns:
        $(D true) if the keyword was found and removed, $(D false) otherwise.
*/
private bool consume(R)(ref R line, string kw)
{
    if(!line.map!toLower.startsWith(kw.map!toLower))
    {
        return false;
    }

    // The match above pairs one element of `line` with one code point of `kw`,
    // so advance by the number of code points. `kw.length` counts UTF-8 code
    // units and would pop too much for a keyword containing non-ASCII text.
    line.popFrontN(kw.walkLength);

    return true;
}
172 
/**
    Removes the leading run of non-whitespace elements from $(D line) and
    returns it as a string.

    Params:
        line = The range to read from; on return it starts at the first
               whitespace element, or is empty if there was none.

    Returns:
        The removed word, which is empty if $(D line) was empty or started
        with whitespace.
*/
private string consumeWord(R)(ref R line)
{
    auto word = appender!string();

    // Pop exactly the elements that are collected. Advancing afterwards by the
    // byte length of the collected string would skip too far whenever the word
    // contains multi-byte characters.
    while(!line.empty && !line.front.isWhite)
    {
        word.put(line.front);
        line.popFront();
    }

    return word.data;
}
180 
/**
    Advances $(D line) past any leading whitespace. If $(D line) contains
    nothing but whitespace it is left empty.
*/
private void skip(R)(ref R line)
{
    line = line.find!(ch => !ch.isWhite);
}
188 
/**
    Reads the file at $(D path) as text and parses it with $(D parseARFF).

    Params:
        path = The path of the file to read.

    Returns:
        The result of $(D parseARFF) applied to the text of the file.
*/
ARFF loadARFF(string path)
{
    return parseARFF(readText(path));
}
195 
/**
    Parses the text of an ARFF document.

    Blank lines and lines starting with '%' are ignored. Until an @data
    statement is seen, @relation and @attribute statements are processed and
    any other line is ignored. After @data every remaining line is read as an
    instance, either dense (comma separated values, one per attribute) or
    sparse ("{index value, index value, ...}", unlisted attributes being 0).

    The relation name is scanned with $(D std.getopt) for an integer option
    "-C" giving the number of label attributes (default 1). A negative value
    means the labels are the leading columns: they are moved to the end of
    every instance and of the attribute list, and the absolute value is used
    as the label count. Because getopt treats the first argument as the program
    name, the option is only recognised after the first word of the name.

    Params:
        content = The complete text of the document.

    Returns:
        An $(D ARFF) built from the relation name, attributes, instances and
        label count that were read.

    Throws:
        $(D Exception) on a malformed header (repeated, misplaced or unnamed
        @relation, an @attribute without name or type, an unterminated quoted
        attribute name, an unsupported attribute type, a nominal attribute
        without categories), on a dense row with fewer values than attributes,
        on a malformed sparse entry, on an unknown nominal value, and when the
        leading-label count exceeds the number of attributes. Conversion
        failures from $(D std.conv) and option errors from $(D std.getopt)
        propagate unchanged.
*/
ARFF parseARFF(string content)
{
    int numLabels = 1;
    bool swapLabels;
    bool dataMode = false;
    string name;
    Attribute[] attribs;
    float[][] vals;

    // Every line is stripped (which also drops a trailing '\r') and blank lines are discarded,
    // so `l` is never empty inside the loop.
    auto inputRange = content
                     .splitter("\n")
                     .map!strip
                     .filter!(x => x.length > 0);

    foreach(l; inputRange)
    {
        if(l.front == '%')
        {
            continue;
        }

        if(!dataMode && l.front == '@')
        {
            if(l.consume("@relation"))
            {
                enforce(name == "", "Relation cannot have multiple @relation statements");

                enforce(attribs.length == 0,
                    "The @relation statement must occur before any @attribute statements");

                name = l.strip;

                enforce(name != "", "The relation must have a name");

                // Only strip the delimiters when the name is wrapped in a matching pair of quotes.
                // The quote test is parenthesised so that it cannot bypass the other two conditions.
                immutable quoted = name.length > 1
                    && name.front == name.back
                    && (name.front == '\'' || name.front == '"');

                if(quoted)
                {
                    name = name[1 .. $ - 1];
                }

                enforce(name != "", "The relation must have a name");

                import std.getopt;
                auto args = name.splitter().array;
                getopt(args, config.passThrough, "C", &numLabels);

                // A negative count marks the labels as the leading columns.
                swapLabels = numLabels < 0;
                numLabels = abs(numLabels);
            }
            else if(l.consume("@attribute"))
            {
                enforce(name != "", "The @relation statement must occur before any @attribute statements");

                l.skip();

                enforce(!l.empty, "The @attribute statement must specify a name and a type");

                string attName;

                if(l.front == '"' || l.front == '\'')
                {
                    auto q = l.front;
                    l.popFront();

                    while(!l.empty && l.front != q)
                    {
                        attName ~= l.front;
                        l.popFront();
                    }

                    enforce(!l.empty, "Unterminated quoted attribute name '" ~ attName ~ "'");

                    // Drop the closing quote.
                    l.popFront();
                }
                else
                {
                    attName = l.consumeWord;
                }

                l.skip();

                auto attSpec = l.strip;

                enforce(attSpec.length > 0, "Attribute '" ~ attName ~ "' is missing a type");

                if(["numeric", "real", "integer"].canFind(attSpec.asLowerCase().to!string))
                {
                    attribs ~= Attribute(attName);
                }
                else if(attSpec.front == '{' && attSpec.back == '}')
                {
                    auto cats = attSpec[1 .. $ - 1].csvReader!string;

                    enforce(!cats.empty, "Nominal attribute '" ~ attName ~ "' must list at least one category");

                    attribs ~= Attribute(attName, cats.front.map!(strip).array());
                }
                else
                {
                    throw new Exception("Unsupported attribute type\n" ~ l ~ "\n" ~ attName ~ "\n" ~ attSpec);
                }
            }
            else if(l.consume("@data"))
            {
                dataMode = true;
            }
        }
        else if(dataMode)
        {
            float[] instVals;

            if(l[0] == '{' && l[$ - 1] == '}')
            {
                // Sparse instance: attributes that are not listed keep the value 0.
                instVals = new float[attribs.length];
                instVals[] = 0.0f;

                foreach(s; l[1 .. $ - 1].splitter(','))
                {
                    auto kv = s.splitter();

                    if(kv.empty)
                    {
                        // Blank entry, e.g. "{ }" or a trailing comma.
                        continue;
                    }

                    auto key = kv.front.to!size_t;

                    enforce(key < attribs.length,
                        text("Sparse index ", key, " is out of range for ", attribs.length, " attributes: ", l));

                    kv.popFront;

                    enforce(!kv.empty, text("Sparse entry for index ", key, " has no value: ", l));

                    auto val = kv.front;

                    if(attribs[key].type == AttributeType.nominal)
                    {
                        instVals[key] = attribs[key].stringToFloat(val);
                    }
                    else
                    {
                        instVals[key] = val.to!float;
                    }
                }
            }
            else
            {
                auto fields = l.splitter(',').map!(x => x.strip()).array;

                // zip stops at the shorter range, so a short row would otherwise
                // silently yield an instance with fewer values than attributes.
                enforce(fields.length >= attribs.length,
                    text("Instance has ", fields.length, " values but ", attribs.length,
                         " attributes are declared: ", l));

                instVals = zip(attribs, fields)
                          .map!(x => x[0].type == AttributeType.numeric ? x[1].to!float : x[0].stringToFloat(x[1]))
                          .array();
            }

            if(swapLabels)
            {
                enforce(cast(size_t)numLabels <= instVals.length,
                    text("The relation declares ", numLabels, " labels but only ", instVals.length,
                         " attributes"));

                // Move the leading label columns to the end of the instance.
                instVals = instVals[numLabels .. $] ~ instVals[0 .. numLabels];
            }

            vals ~= instVals;
        }
    }

    if(swapLabels)
    {
        enforce(cast(size_t)numLabels <= attribs.length,
            text("The relation declares ", numLabels, " labels but only ", attribs.length,
                 " attributes"));

        // Reorder the attribute list the same way as the instances.
        attribs = attribs[numLabels .. $] ~ attribs[0 .. numLabels];
    }

    return ARFF(name, attribs, vals, cast(uint)numLabels);
}
340 
unittest
{
    // Dense numeric rows, quoted attribute names, a mixed-case type keyword
    // and a comment line in the header.
    string source = `
        @relation arffdata

        @attribute "some attribute" numeric
        @attribute 'label' REAL

        % Now the data starts!
        @data
        1,2
        3,4
        5,6

    `;

    auto parsed = parseARFF(source);
    auto attrs = parsed.attributes;

    assert(parsed.name == "arffdata");
    assert(attrs.length == 2);
    assert(attrs[0].name == "some attribute");
    assert(attrs[1].name == "label");

    float[][] expected = [
        [1.0f, 2.0f],
        [3.0f, 4.0f],
        [5.0f, 6.0f]
    ];

    assert(parsed.values.equal(expected));
}
369 
unittest
{
    // "-C -2" in the relation name marks the first two columns as labels,
    // so they are expected at the end of each parsed row.
    string source = `
        @relation "relname -C -2"
        @attribute label1 {0,1}
        @attribute label2 {0,1}
        @attribute attrib1 numeric
        @attribute attrib2 numeric
        @data
        0,1,5.2,4.3
        1,0,3.6,8.1
    `;

    auto parsed = parseARFF(source);

    float[][] expected = [
        [5.2f, 4.3f, 0.0f, 1.0f],
        [3.6f, 8.1f, 1.0f, 0.0f]
    ];

    assert(parsed.values.equal(expected));
}