module arff;

import std.algorithm;
import std.array;
import std.conv;
import std.csv;
import std.exception;
import std.file;
import std.math;
import std.range;
import std.stdio;
import std.string;
import std.uni;

/**
Indicates the type of an attribute.
*/
enum AttributeType
{
    numeric,
    nominal
}

/**
Contains metadata relating to an attribute in the ARFF file.
*/
struct Attribute
{
    public
    {
        /**
        Constructs an $(D Attribute) representing a numeric attribute with the given name.

        Params:
            name = The name of the attribute.
        */
        this(string name)
        {
            mName = name;
            mType = AttributeType.numeric;
        }

        /**
        Constructs an $(D Attribute) representing a nominal attribute with the given name and possible values.

        Params:
            name = The name of the attribute.
            categories = The possible values that this attribute can take. The array is copied.
        */
        this(string name, string[] categories)
        {
            mName = name;
            mType = AttributeType.nominal;
            mCategories = categories.dup;
        }

        /**
        For a nominal attribute, returns a floating point value used to represent the given category internally.

        The value is the index of $(D category) within $(D categories).

        Params:
            category = One of the values declared for this attribute.

        Throws:
            $(D Exception) if this attribute is not nominal, or if $(D category) is not one of its declared values.
        */
        float stringToFloat(string category) const
        {
            enforce(mType == AttributeType.nominal,
                "Cannot perform stringToFloat because '" ~ name ~ "' is not a nominal attribute");

            auto ind = mCategories.countUntil(category);

            enforce(ind != -1, "Unknown nominal value '" ~ category ~ "' for attribute '" ~ mName ~ "'");

            return cast(float)ind;
        }

        /// The name of the attribute.
        @property string name() const
        {
            return mName;
        }

        /// Whether the attribute is numeric or nominal.
        @property AttributeType type() const
        {
            return mType;
        }

        /// The declared values of a nominal attribute (empty for numeric attributes).
        @property const(string[]) categories() const
        {
            return mCategories;
        }
    }

    private
    {
        string mName;
        AttributeType mType;
        string[] mCategories;
    }
}

/**
Stores a collection of instances loaded from an ARFF file.
*/
struct ARFF
{
    public
    {
        /**
        Constructs an $(D ARFF) from already-parsed data. The attribute array and every row of values are copied.

        Params:
            name = The relation name.
            attribs = The attributes, one per column.
            vals = The instances, one row per instance.
            lbls = The number of trailing columns that are labels.
        */
        this(string name, Attribute[] attribs, float[][] vals, uint lbls)
        {
            mName = name;

            mAttributes = attribs.array;

            mValues = vals
                .map!array
                .array;

            mLabels = lbls;
        }

        /**
        Returns a copy of $(D inst) in which the trailing $(D labels) entries are replaced with NaN.

        Throws:
            $(D Exception) if $(D inst) has fewer entries than there are labels.
        */
        float[] obscureLabels(float[] inst)
        {
            enforce(inst.length >= mLabels, "Instance has fewer values than the number of labels");

            auto newInst = inst.dup;
            newInst[$ - mLabels .. $] = float.nan;

            return newInst;
        }

        /// The relation name.
        @property string name() const
        {
            return mName;
        }

        /// The attributes, one per column of $(D values).
        @property Attribute[] attributes()
        {
            return mAttributes;
        }

        /// The number of label columns.
        @property uint labels()
        {
            return mLabels;
        }

        /// The number of attributes minus the number of labels.
        @property uint features()
        {
            return cast(uint)mAttributes.length - mLabels;
        }

        /// The instances, one row per instance.
        @property float[][] values()
        {
            return mValues;
        }
    }

    private
    {
        string mName;
        Attribute[] mAttributes;
        uint mLabels;
        float[][] mValues;
    }
}

/*
If line starts with kw (compared case-insensitively), removes kw from the front of line and returns true;
otherwise leaves line untouched and returns false.
*/
private bool consume(R)(ref R line, string kw)
{
    if(line.map!toLower.startsWith(kw.map!toLower))
    {
        // popFrontN counts range elements (code points for strings), so use walkLength rather than .length
        line.popFrontN(kw.walkLength);

        return true;
    }
    else
    {
        return false;
    }
}

/*
Removes and returns the leading run of non-whitespace characters from line.
*/
private string consumeWord(R)(ref R line)
{
    auto ret = line.until!isWhite.to!string;

    // ret.length is a count of UTF-8 code units, but popFrontN pops code points; walkLength gives the
    // matching count so that non-ASCII words do not eat into the rest of the line.
    line.popFrontN(ret.walkLength);

    return ret;
}

/*
Removes any leading whitespace from line.
*/
private void skip(R)(ref R line)
{
    while(!line.empty && line.front.isWhite)
    {
        line.popFront;
    }
}

/**
Reads the file at $(D path) as text and parses it with $(D parseARFF).
*/
ARFF loadARFF(string path)
{
    string content = readText(path);

    return parseARFF(content);
}

/**
Parses the text of an ARFF file.

Lines are stripped and blank lines and lines starting with '%' are ignored. Before the $(D @data) statement the
$(D @relation) and $(D @attribute) statements are processed; only numeric/real/integer and nominal ($(D {a,b,c}))
attributes are supported. After $(D @data) each line is an instance, either dense (comma separated values) or
sparse ($(D {index value, ...}), unspecified entries being 0).

The relation name may contain a "-C n" option giving the number of label columns (default 1). If n is negative
the first |n| columns are the labels and they are moved to the end of each instance and of the attribute list.

Params:
    content = The text of the ARFF file.

Throws:
    $(D Exception) on malformed or unsupported input.
*/
ARFF parseARFF(string content)
{
    int numLabels = 1;
    bool swapLabels;
    bool dataMode = false;
    string name;
    Attribute[] attribs;
    float[][] vals;

    auto inputRange = content
        .splitter("\n")
        .map!strip
        .filter!(x => x.length > 0);

    foreach(l; inputRange)
    {
        if(l.front == '%')
        {
            continue;
        }

        if(!dataMode && l.length > 0 && l.front == '@')
        {
            if(l.consume("@relation"))
            {
                enforce(name == "", "Relation cannot have multiple @relation statements");

                enforce(attribs.length == 0,
                    "The @relation statement must occur before any @attribute statements");

                name = l.strip;

                enforce(name != "", "The relation must have a name");

                // Remove one pair of matching surrounding quotes. The parentheses matter: without them the
                // '"' test would apply on its own, regardless of the length and matching-quote checks.
                if(name.length > 1 && name.front == name.back && (name.front == '\'' || name.front == '"'))
                {
                    name = name[1 .. $ - 1];
                }

                enforce(name != "", "The relation must have a name");

                // The first word plays the role of the program name for getopt; "-C n" sets the label count.
                import std.getopt;
                auto args = name.splitter().array;
                getopt(args, config.passThrough, "C", &numLabels);

                swapLabels = numLabels < 0;
                numLabels = abs(numLabels);
            }
            else if(l.consume("@attribute"))
            {
                enforce(name != "", "The @relation statement must occur before any @attribute statements");

                l.skip();
                enforce(!l.empty, "The @attribute statement must specify a name and a type");

                string attName;

                if(l.front == '"' || l.front == '\'')
                {
                    auto q = l.front;
                    l.popFront();

                    while(!l.empty && l.front != q)
                    {
                        attName ~= l.front;
                        l.popFront();
                    }

                    enforce(!l.empty, "Unterminated quoted attribute name '" ~ attName ~ "'");

                    // Drop the closing quote
                    l.popFront();
                }
                else
                {
                    attName = l.consumeWord;
                }

                l.skip();

                auto attSpec = l.strip;

                enforce(attSpec.length > 0, "Missing type for attribute '" ~ attName ~ "'");

                if(["numeric", "real", "integer"].canFind(attSpec.asLowerCase().to!string))
                {
                    attribs ~= Attribute(attName);
                }
                else if(attSpec.front == '{' && attSpec.back == '}')
                {
                    auto cats = attSpec[1 .. $ - 1].csvReader!string;

                    enforce(!cats.empty, "Nominal attribute '" ~ attName ~ "' must declare at least one value");

                    attribs ~= Attribute(attName, cats.front.map!(strip).array());
                }
                else
                {
                    throw new Exception("Unsupported attribute type\n" ~ l ~ "\n" ~ attName ~ "\n" ~ attSpec);
                }
            }
            else if(l.consume("@data"))
            {
                dataMode = true;
            }
        }
        else if(dataMode && l.length > 0)
        {
            float[] instVals = new float[attribs.length];
            instVals[] = 0.0f;

            if(l[0] == '{' && l[$ - 1] == '}')
            {
                // Sparse instance: "{index value, index value, ...}"
                foreach(s; l[1 .. $ - 1].splitter(','))
                {
                    auto kv = s.splitter();

                    // An empty entry (e.g. the all-zero instance "{}") carries no value
                    if(kv.empty)
                    {
                        continue;
                    }

                    auto key = kv.front.to!size_t;
                    kv.popFront;

                    enforce(!kv.empty, "Missing value in sparse entry '" ~ s ~ "'");
                    enforce(key < attribs.length,
                        "Attribute index " ~ key.to!string ~ " is out of range in sparse entry '" ~ s ~ "'");

                    auto val = kv.front;

                    if(attribs[key].type == AttributeType.nominal)
                    {
                        instVals[key] = attribs[key].stringToFloat(val);
                    }
                    else
                    {
                        instVals[key] = val.to!float;
                    }
                }
            }
            else
            {
                // Dense instance: one comma separated value per attribute
                instVals = zip(attribs, l.splitter(',').map!(x => x.strip()))
                    .map!(x => x[0].type == AttributeType.numeric ? x[1].to!float : x[0].stringToFloat(x[1]))
                    .array();

                // zip stops at the shorter range, so a short row would otherwise yield a ragged matrix
                enforce(instVals.length == attribs.length,
                    "Expected " ~ attribs.length.to!string ~ " values but found " ~ instVals.length.to!string
                    ~ " in instance '" ~ l ~ "'");
            }

            if(swapLabels)
            {
                enforce(numLabels <= instVals.length, "The number of labels exceeds the number of attributes");

                instVals = instVals[numLabels .. $] ~ instVals[0 .. numLabels];
            }

            vals ~= instVals;
        }
    }

    if(swapLabels)
    {
        enforce(numLabels <= attribs.length, "The number of labels exceeds the number of attributes");

        attribs = attribs[numLabels .. $] ~ attribs[0 .. numLabels];
    }

    return ARFF(name, attribs, vals, cast(uint)numLabels);
}

unittest
{
    string content = `
    @relation arffdata

    @attribute "some attribute" numeric
    @attribute 'label' REAL

    % Now the data starts!
    @data
    1,2
    3,4
    5,6

    `;

    auto arff = parseARFF(content);

    assert(arff.name == "arffdata");
    assert(arff.attributes.length == 2);
    assert(arff.attributes[0].name == "some attribute");
    assert(arff.attributes[1].name == "label");
    assert(arff.values.equal([
        [1.0f, 2.0f],
        [3.0f, 4.0f],
        [5.0f, 6.0f]
    ]));
}

unittest
{
    // The "-C -2" in the relation name is used to indicate the first two columns in the relation are the labels
    string content = `
    @relation "relname -C -2"
    @attribute label1 {0,1}
    @attribute label2 {0,1}
    @attribute attrib1 numeric
    @attribute attrib2 numeric
    @data
    0,1,5.2,4.3
    1,0,3.6,8.1
    `;

    auto arff = parseARFF(content);

    assert(arff.values.equal([
        [5.2f, 4.3f, 0.0f, 1.0f],
        [3.6f, 8.1f, 1.0f, 0.0f]
    ]));
}

unittest
{
    // Unquoted attribute names containing multi-byte characters must not eat into the type
    auto arff = parseARFF("@relation rel\n@attribute ééé numeric\n@data\n1.5\n");

    assert(arff.attributes.length == 1);
    assert(arff.attributes[0].name == "ééé");
    assert(arff.values.equal([[1.5f]]));

    // A relation name that only starts with a quote is left untouched
    assert(parseARFF("@relation \"abc\n").name == "\"abc");

    // Malformed input is reported with an Exception
    assertThrown!Exception(parseARFF("@relation r\n@attribute \"abc numeric\n"));
    assertThrown!Exception(parseARFF("@relation r\n@attribute a numeric\n@attribute b numeric\n@data\n1\n"));
}