1 module arff;
2 
3 import std.algorithm;
4 import std.array;
5 import std.conv;
6 import std.csv;
7 import std.exception;
8 import std.math;
9 import std.range;
10 import std.stdio;
11 import std.string;
12 import std.uni;
13 
/**
	Enumerates the kinds of attribute that an $(D Attribute) can describe.
*/
enum AttributeType
{
    /// The attribute holds a number.
    numeric,

    /// The attribute takes one value out of a fixed list of named categories.
    nominal
}
22 
/**
	Describes one attribute declared in an ARFF file: its name, its type and,
	for nominal attributes, the list of values it may take.
*/
struct Attribute
{
    public
    {
        /**
            Creates the description of a numeric attribute.

            Params:
                name = The name of the attribute.
        */
        this(string name)
        {
            mType = AttributeType.numeric;
            mName = name;
        }

        /**
            Creates the description of a nominal attribute.

            Params:
                name = The name of the attribute.
                categories = The values the attribute may take. The array is copied.
        */
        this(string name, string[] categories)
        {
            mCategories = categories.dup;
            mType = AttributeType.nominal;
            mName = name;
        }

        /**
            Maps a category of a nominal attribute to the number that stands for it,
            which is the position of the category in $(D categories).

            Params:
                category = The category to look up.

            Returns:
                The zero-based index of $(D category), converted to $(D float).

            Throws:
                $(D Exception) if this attribute is not nominal, or if $(D category)
                is not one of its categories.
        */
        float stringToFloat(string category) const
        {
            if(mType != AttributeType.nominal)
            {
                throw new Exception(
                    "Cannot perform stringToFloat because '" ~ mName ~ "' is not a nominal attribute");
            }

            foreach(position, candidate; mCategories)
            {
                if(candidate == category)
                {
                    return cast(float)position;
                }
            }

            throw new Exception("Unknown nominal value '" ~ category ~ "' for attribute '" ~ mName ~ "'");
        }

        /// The name of the attribute.
        @property string name() const
        {
            return mName;
        }

        /// Whether the attribute is numeric or nominal.
        @property AttributeType type() const
        {
            return mType;
        }

        /// The categories given at construction; empty for a numeric attribute.
        @property const(string[]) categories() const
        {
            return mCategories;
        }
    }

    private
    {
        string mName;
        AttributeType mType;
        string[] mCategories;
    }
}
94 
/**
	Stores a collection of instances loaded from an ARFF file, together with the
	attributes that describe their columns and the number of trailing columns
	that are labels.
*/
struct ARFF
{
    public
    {
        /**
            Constructs a dataset.

            Params:
                name = The name of the relation.
                attribs = One entry per column of every instance.
                vals = The instances, one array of values per instance.
                lbls = How many columns, counted from the end, are labels.
        */
        this(string name, Attribute[] attribs, float[][] vals, uint lbls)
        {
            mName = name;

            // Both arrays are copied so the dataset owns its own storage.
            mAttributes = attribs.array;

            mValues = vals
                     .map!array
                     .array;

            mLabels = lbls;
        }

        /**
            Returns a copy of $(D inst) in which the last $(D labels) values are
            replaced by NaN. The argument itself is not modified.

            Params:
                inst = The instance whose label values should be hidden.

            Throws:
                $(D Exception) if $(D inst) holds fewer values than there are labels.
        */
        float[] obscureLabels(float[] inst) const
        {
            // Without this check "$ - mLabels" wraps around for a short instance
            // and the slice below is a range violation.
            enforce(inst.length >= mLabels,
                "Cannot obscure labels because the instance has fewer values than there are labels");

            auto newInst = inst.dup;
            newInst[$ - mLabels .. $] = float.nan;

            return newInst;
        }

        /// The name of the relation.
        @property string name() const
        {
            return mName;
        }

        /// The attributes describing the columns. The returned slice refers to the dataset's own array.
        @property Attribute[] attributes()
        {
            return mAttributes;
        }

        /// The number of label columns.
        @property uint labels() const
        {
            return mLabels;
        }

        /**
            The number of attributes that are not labels.

            The subtraction is unsigned, so the result wraps around if there
            are more labels than attributes.
        */
        @property uint features() const
        {
            return cast(uint)mAttributes.length - mLabels;
        }

        /// The instances. The returned slice refers to the dataset's own arrays.
        @property float[][] values()
        {
            return mValues;
        }
    }

    private
    {
        string mName;
        Attribute[] mAttributes;
        uint mLabels;
        float[][] mValues;
    }
}
157 
/**
	Tests, ignoring case, whether $(D line) begins with $(D kw) and removes that
	prefix from $(D line) when it does.

	Params:
		line = The text to inspect; advanced past the keyword on a match.
		kw = The keyword to look for.

	Returns:
		$(D true) if the keyword was found and removed, $(D false) if $(D line)
		was left untouched.
*/
private bool consume(R)(ref R line, string kw)
{
    if(!line.map!toLower.startsWith(kw.map!toLower))
    {
        return false;
    }

    // The comparison above matched one element of line per decoded character
    // of kw, so advance by that count. kw.length counts UTF-8 code units and
    // would remove too much for a keyword containing non-ASCII characters.
    line.popFrontN(kw.walkLength);

    return true;
}
171 
/**
	Removes the leading run of non-whitespace characters from $(D line) and
	returns it.

	Params:
		line = The text to read from; advanced to the first whitespace
		       character, or to its end if there is none.

	Returns:
		The characters that were removed; empty if $(D line) is empty or starts
		with whitespace.
*/
private string consumeWord(R)(ref R line)
{
    auto word = appender!string();

    // Consume element by element. Advancing by the byte length of the collected
    // word would skip too far whenever it contains multi-byte characters.
    while(!line.empty && !line.front.isWhite)
    {
        word.put(line.front);
        line.popFront();
    }

    return word.data;
}
179 
/**
	Advances $(D line) past any whitespace at its front.

	Params:
		line = The text to advance; left unchanged if it does not start with
		       whitespace.
*/
private void skip(R)(ref R line)
{
    for(; !line.empty; line.popFront())
    {
        if(!line.front.isWhite)
        {
            break;
        }
    }
}
187 
/**
	Loads a dataset from the ARFF file at $(D path).

	The header may contain $(D @relation), $(D @attribute) and $(D @data)
	statements (matched case-insensitively). Attributes may be numeric
	($(D numeric), $(D real) or $(D integer)) or nominal ($(D {a, b, c})).
	Data rows may be dense (comma separated values) or sparse
	($(D {index value, ...}), with unlisted attributes set to 0). Blank lines
	and lines starting with $(D %) are ignored.

	The relation name may carry a $(D -C n) option giving the number of labels:
	$(UL
		$(LI positive $(D n): the first $(D n) attributes are labels; they are
		     moved to the end of every instance and of the attribute list;)
		$(LI negative $(D n): there are $(D |n|) labels and nothing is reordered;)
		$(LI absent: there is one label and nothing is reordered.)
	)

	Params:
		path = The file to read.

	Returns:
		The loaded dataset.

	Throws:
		$(D Exception) if the header is malformed, an attribute type is not
		supported, or a data row cannot be parsed.
*/
ARFF loadARFF(string path)
{
    static immutable tooManyLabels =
        "The -C option asks for more labels than there are attributes";

    auto f = File(path, "r");

    // int.min marks "no -C option seen"; it is replaced by the default below.
    int numLabels = int.min;
    bool swapLabels;
    bool dataMode = false;
    string name;
    Attribute[] attribs;
    float[][] vals;

    foreach(l; f.byLineCopy.map!strip)
    {
        // Blank lines and comments carry no information in either section.
        if(l.empty || l.front == '%')
        {
            continue;
        }

        if(!dataMode && l.front == '@')
        {
            if(l.consume("@relation"))
            {
                enforce(name == "", "Relation cannot have multiple @relation statements");

                enforce(attribs.length == 0,
                    "The @relation statement must occur before any @attribute statements");

                name = l.strip;

                enforce(name != "", "The relation must have a name");

                // Strip one pair of matching quotes. The parentheses matter:
                // without them any name that merely starts with '"' was sliced.
                if(name.length > 1 && name.front == name.back && (name.front == '\'' || name.front == '"'))
                {
                    name = name[1 .. $ - 1];
                }

                enforce(name != "", "The relation must have a name");

                // getopt treats the first word as the program name, so the
                // option is only recognised after the actual relation name.
                import std.getopt : config, getopt;
                auto args = name.splitter().array;
                getopt(args, config.passThrough, "C", &numLabels);

                swapLabels = numLabels > 0;
                numLabels = numLabels == int.min ? 1 : abs(numLabels);
            }
            else if(l.consume("@attribute"))
            {
                enforce(name != "", "The @relation statement must occur before any @attribute statements");

                l.skip();
                auto attName = l.consumeWord;
                l.skip();

                auto attSpec = l.strip;

                enforce(attName != "" && attSpec != "",
                    "An @attribute statement must have both a name and a type");

                if(attSpec.asLowerCase.equal("numeric")
                    || attSpec.asLowerCase.equal("real")
                    || attSpec.asLowerCase.equal("integer"))
                {
                    attribs ~= Attribute(attName);
                }
                else if(attSpec.front == '{' && attSpec.back == '}')
                {
                    auto cats = attSpec[1 .. $ - 1].csvReader!string;

                    enforce(!cats.empty,
                        "The nominal attribute '" ~ attName ~ "' must list at least one value");

                    attribs ~= Attribute(attName, cats.front.map!(strip).array());
                }
                else
                {
                    throw new Exception("Unsupported attribute type\n" ~ l ~ "\n" ~ attName ~ "\n" ~ attSpec);
                }
            }
            else if(l.consume("@data"))
            {
                // The attribute list is final from here on, so the per-row
                // rotation below is known to be in bounds.
                enforce(!swapLabels || cast(size_t)numLabels <= attribs.length, tooManyLabels);

                dataMode = true;
            }
        }
        else if(dataMode)
        {
            float[] instVals;

            if(l[0] == '{' && l[$ - 1] == '}')
            {
                // Sparse row: attributes that are not listed default to zero.
                instVals = new float[attribs.length];
                instVals[] = 0.0f;

                foreach(s; l[1 .. $ - 1].splitter(','))
                {
                    auto kv = s.splitter();

                    enforce(!kv.empty, "Empty entry in sparse data row: " ~ l);

                    auto key = kv.front.to!size_t;
                    kv.popFront;

                    enforce(!kv.empty, "Sparse entry '" ~ s ~ "' has no value");
                    enforce(key < attribs.length,
                        "Sparse attribute index " ~ key.to!string ~ " is out of range");

                    auto val = kv.front;

                    if(attribs[key].type == AttributeType.nominal)
                    {
                        instVals[key] = attribs[key].stringToFloat(val);
                    }
                    else
                    {
                        instVals[key] = val.to!float;
                    }
                }
            }
            else
            {
                auto fields = l.splitter(',').map!(x => x.strip()).array;

                // zip stops at the shorter range, so a short row would
                // otherwise silently produce a short instance. Surplus
                // fields are still ignored.
                enforce(fields.length >= attribs.length,
                    "Expected " ~ attribs.length.to!string ~ " values but found "
                    ~ fields.length.to!string ~ " in data row: " ~ l);

                instVals = zip(attribs, fields)
                          .map!(x => x[0].type == AttributeType.numeric ? x[1].to!float : x[0].stringToFloat(x[1]))
                          .array();
            }

            if(swapLabels)
            {
                // Only the values are rotated here. The attribute list must keep
                // the file's column order until every row has been parsed.
                instVals = instVals[numLabels .. $] ~ instVals[0 .. numLabels];
            }

            vals ~= instVals;
        }
    }

    if(swapLabels)
    {
        // Rotate the attribute list exactly once so it lines up with the
        // rotated instances.
        enforce(cast(size_t)numLabels <= attribs.length, tooManyLabels);

        attribs = attribs[numLabels .. $] ~ attribs[0 .. numLabels];
    }

    // Without a @relation statement the sentinel is still in place; fall back
    // to the same default used when the -C option is absent.
    if(numLabels == int.min)
    {
        numLabels = 1;
    }

    return ARFF(name, attribs, vals, cast(uint)numLabels);
}