module arff;

import std.algorithm;
import std.array;
import std.conv;
import std.csv;
import std.exception;
import std.math;
import std.range;
import std.stdio;
import std.string;
import std.uni;

/**
Indicates the type of an attribute.
*/
enum AttributeType
{
    numeric,
    nominal
}

/**
Contains metadata relating to an attribute in the ARFF file.
*/
struct Attribute
{
    public
    {
        /**
        Constructs an $(D Attribute) representing a numeric attribute with the given name.

        Params:
            name = The name of the attribute.
        */
        this(string name)
        {
            mName = name;
            mType = AttributeType.numeric;
        }

        /**
        Constructs an $(D Attribute) representing a nominal attribute with the given name and possible values.

        Params:
            name = The name of the attribute.
            categories = The possible values that this attribute can take. The array is copied.
        */
        this(string name, string[] categories)
        {
            mName = name;
            mType = AttributeType.nominal;
            mCategories = categories.dup;
        }

        /**
        For a nominal attribute, returns a floating point value used to represent the given category internally.

        The value is the index of $(D category) within $(D categories).

        Params:
            category = One of the values this attribute can take.

        Throws:
            $(D Exception) if this attribute is not nominal, or if $(D category) is not one of its categories.
        */
        float stringToFloat(string category) const
        {
            enforce(mType == AttributeType.nominal,
                "Cannot perform stringToFloat because '" ~ name ~ "' is not a nominal attribute");

            auto ind = mCategories.countUntil(category);

            enforce(ind != -1, "Unknown nominal value '" ~ category ~ "' for attribute '" ~ mName ~ "'");

            return cast(float)ind;
        }

        /// The name of the attribute.
        @property string name() const
        {
            return mName;
        }

        /// Whether the attribute is numeric or nominal.
        @property AttributeType type() const
        {
            return mType;
        }

        /// The possible values of a nominal attribute (empty for a numeric attribute).
        @property const(string[]) categories() const
        {
            return mCategories;
        }
    }

    private
    {
        string mName;
        AttributeType mType;
        string[] mCategories;
    }
}

/**
Stores a collection of instances loaded from an ARFF file.

$(D obscureLabels) and $(D features) treat the last $(D labels) attributes of every instance as the labels.
*/
struct ARFF
{
    public
    {
        /**
        Constructs an $(D ARFF) from already parsed data. The attribute array and every instance are copied.

        Params:
            name = The relation name.
            attribs = The attributes, in the same order as the columns of $(D vals).
            vals = One array of values per instance.
            lbls = The number of trailing attributes that are labels.
        */
        this(string name, Attribute[] attribs, float[][] vals, uint lbls)
        {
            mName = name;

            mAttributes = attribs.array;

            mValues = vals
                .map!(v => v.dup)
                .array;

            mLabels = lbls;
        }

        /**
        Returns a copy of $(D inst) in which the trailing $(D labels) values are replaced by NaN.

        Throws:
            $(D Exception) if $(D inst) has fewer elements than there are labels.
        */
        float[] obscureLabels(float[] inst) const
        {
            // Without this check "$ - mLabels" would wrap around and trigger a range violation.
            enforce(inst.length >= mLabels, "The instance has fewer values than there are labels");

            auto newInst = inst.dup;
            newInst[$ - mLabels .. $] = float.nan;

            return newInst;
        }

        /// The relation name, as given by the @relation statement.
        @property string name() const
        {
            return mName;
        }

        /// The attributes, in column order.
        @property Attribute[] attributes()
        {
            return mAttributes;
        }

        /// The number of trailing attributes that are labels.
        @property uint labels() const
        {
            return mLabels;
        }

        /// The number of attributes that are not labels.
        @property uint features() const
        {
            return cast(uint)mAttributes.length - mLabels;
        }

        /// The instances, one array of values per instance.
        @property float[][] values()
        {
            return mValues;
        }
    }

    private
    {
        string mName;
        Attribute[] mAttributes;
        uint mLabels;
        float[][] mValues;
    }
}

/*
If line starts with kw (compared case-insensitively), removes that prefix from line and returns true.
Otherwise leaves line untouched and returns false.
*/
private bool consume(R)(ref R line, string kw)
{
    if(!line.map!toLower.startsWith(kw.map!toLower))
    {
        return false;
    }

    // popFrontN pops code points, so count code points rather than UTF-8 code units.
    line.popFrontN(kw.walkLength);

    return true;
}

/*
Removes the leading run of non-whitespace characters from line and returns it.
*/
private string consumeWord(R)(ref R line)
{
    auto ret = line.until!isWhite.to!string;

    // ret.length is a code unit count; popFrontN needs the number of code points.
    line.popFrontN(ret.walkLength);

    return ret;
}

/*
Removes leading whitespace from line.
*/
private void skip(R)(ref R line)
{
    while(!line.empty && line.front.isWhite)
    {
        line.popFront;
    }
}

/*
Returns a new array in which the first count elements of items have been moved to the end.
*/
private T[] labelsToBack(T)(T[] items, size_t count)
{
    enforce(count <= items.length, "The relation declares more labels than there are attributes");

    return items[count .. $] ~ items[0 .. count];
}

/*
Parses one line of the @data section into one value per attribute.

Both the dense form (comma separated values) and the sparse form ("{index value, ...}", where
unlisted attributes are 0) are handled. Nominal values are converted with Attribute.stringToFloat.
*/
private float[] parseInstance(string line, const(Attribute)[] attribs)
{
    auto instVals = new float[attribs.length];

    if(line.length > 1 && line.front == '{' && line.back == '}')
    {
        instVals[] = 0.0f;

        foreach(entry; line[1 .. $ - 1].splitter(','))
        {
            auto kv = entry.strip.splitter();

            enforce(!kv.empty, "Malformed sparse instance\n" ~ line);
            immutable key = kv.front.to!size_t;
            kv.popFront;

            enforce(!kv.empty, "Malformed sparse instance\n" ~ line);
            auto val = kv.front;

            enforce(key < attribs.length, "Attribute index out of range in sparse instance\n" ~ line);

            instVals[key] = attribs[key].type == AttributeType.nominal
                ? attribs[key].stringToFloat(val)
                : val.to!float;
        }

        return instVals;
    }

    auto tokens = line.splitter(',').map!(t => t.strip).array;

    // Surplus values are ignored, but a short row would leave the data set ragged.
    enforce(tokens.length >= attribs.length, "The instance has fewer values than there are attributes\n" ~ line);

    foreach(i, ref attrib; attribs)
    {
        instVals[i] = attrib.type == AttributeType.numeric
            ? tokens[i].to!float
            : attrib.stringToFloat(tokens[i]);
    }

    return instVals;
}

/**
Loads the ARFF file at the given path.

The relation name may contain an option of the form "-C n". A positive n means that the first n
attributes are labels; they are moved to the end of the attribute list and of every instance. A negative n
means that the last |n| attributes are labels. Without the option a single trailing label is assumed.

Only numeric (numeric, real, integer) and nominal attributes are supported. Blank lines and lines
starting with '%' are ignored.

Params:
    path = The path of the file to load.

Returns:
    An $(D ARFF) in which the labels are the last $(D labels) attributes.

Throws:
    $(D Exception) if the file is malformed or uses an unsupported attribute type. Errors from opening the
    file, from option parsing and from number conversion are propagated.
*/
ARFF loadARFF(string path)
{
    import std.getopt;

    auto f = File(path, "r");
    int numLabels = int.min;
    bool swapLabels;
    bool dataMode = false;
    string name;
    Attribute[] attribs;
    float[][] vals;

    foreach(l; f.byLineCopy.map!strip)
    {
        // Blank lines and comments carry no information in either section.
        if(l.length == 0 || l.front == '%')
        {
            continue;
        }

        if(dataMode)
        {
            // Parse against the attributes in file order and only then move the labels; the attribute
            // list itself is reordered once, after the whole file has been read.
            auto instVals = parseInstance(l, attribs);

            if(swapLabels)
            {
                instVals = labelsToBack(instVals, numLabels);
            }

            vals ~= instVals;
            continue;
        }

        if(l.front != '@')
        {
            continue;
        }

        if(l.consume("@relation"))
        {
            enforce(name == "", "Relation cannot have multiple @relation statements");

            enforce(attribs.length == 0,
                "The @relation statement must occur before any @attribute statements");

            name = l.strip;

            enforce(name != "", "The relation must have a name");

            // Strip the quotes only when the name is wrapped in a matching pair of them.
            immutable quoted = name.length > 1
                && name.front == name.back
                && (name.front == '\'' || name.front == '"');

            if(quoted)
            {
                name = name[1 .. $ - 1];
            }

            enforce(name != "", "The relation must have a name");

            // getopt treats the first word as the program name, so it is never parsed as an option.
            auto args = name.splitter().array;
            getopt(args, config.passThrough, "C", &numLabels);

            swapLabels = numLabels > 0;
            numLabels = numLabels == int.min ? 1 : abs(numLabels);
        }
        else if(l.consume("@attribute"))
        {
            enforce(name != "", "The @relation statement must occur before any @attribute statements");

            l.skip();
            auto attName = l.consumeWord;
            l.skip();

            auto attSpec = l.strip;

            enforce(attName != "" && attSpec != "", "An @attribute statement must have a name and a type");

            if(attSpec.asLowerCase.equal("numeric")
                || attSpec.asLowerCase.equal("real")
                || attSpec.asLowerCase.equal("integer"))
            {
                attribs ~= Attribute(attName);
            }
            else if(attSpec.length > 1 && attSpec.front == '{' && attSpec.back == '}')
            {
                auto cats = attSpec[1 .. $ - 1].csvReader!string;
                string[] categories;

                // "{}" yields no CSV record at all.
                if(!cats.empty)
                {
                    categories = cats.front.map!(c => c.strip).array;
                }

                attribs ~= Attribute(attName, categories);
            }
            else
            {
                throw new Exception("Unsupported attribute type\n" ~ l ~ "\n" ~ attName ~ "\n" ~ attSpec);
            }
        }
        else if(l.consume("@data"))
        {
            dataMode = true;
        }
    }

    if(swapLabels)
    {
        attribs = labelsToBack(attribs, numLabels);
    }

    return ARFF(name, attribs, vals, cast(uint)numLabels);
}