aboutcode-toolkit/src/attributecode/transform.py at develop · aboutcode-org/aboutcode-toolkit

History

444 lines (371 loc) · 14.5 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

#!/usr/bin/env python

# -*- coding: utf8 -*-

# ============================================================================

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

# ============================================================================

import json

from collections import Counter, OrderedDict

from itertools import zip_longest

import attr

import openpyxl

from attributecode import CRITICAL

from attributecode import Error

from attributecode import saneyaml

from attributecode.util import csv

from attributecode.util import replace_tab_with_spaces

def transform_csv(location):
    """
    Read a CSV file at `location` and convert its rows into a list of
    dictionaries keyed by the (stripped) header field names.

    Return a tuple of (list of row dicts, list of Error objects). The list of
    dicts is empty when the file has no header row or when the header contains
    duplicated field names; in both cases a CRITICAL Error is reported.
    """
    rows = read_csv_rows(location)

    # The first row is the header. An empty file yields no row at all: report
    # this as an error rather than letting StopIteration escape to the caller.
    names = next(rows, None)
    if names is None:
        return [], [Error(CRITICAL, "CSV file is empty: no header row found")]

    field_names = strip_trailing_fields_csv(names)

    msg = "Duplicated field name: %(name)s"
    errors = [
        Error(CRITICAL, msg % {"name": name})
        for name in check_duplicate_fields(field_names)
    ]
    if errors:
        # Stop reading now so the underlying file is released promptly.
        rows.close()
        return [], errors

    # Convert each remaining row to a dict. zip_longest pads short rows with
    # None values; surplus values of a long row end up under the None key.
    new_data = [dict(zip_longest(field_names, item)) for item in rows]
    return new_data, errors

def transform_json(location):
    """
    Load the JSON file at `location`, normalize it to a list of dictionaries
    and strip surrounding whitespace from every field name.

    Return a tuple of (list of dicts, list of errors). The list of errors is
    always empty here.
    """
    normalized = normalize_dict_data(read_json(location))
    return strip_trailing_fields_json(normalized), []

def transform_excel(location, worksheet=None):
    """
    Read a XLSX file at `location` and convert data into list of dictionaries.

    `worksheet` is an optional worksheet name; when not provided the active
    worksheet of the workbook is used.

    Return a tuple of (list of row dicts, list of Error objects).
    """
    # read_excel() already reports duplicated column names as ready-made Error
    # objects: pass them through unchanged rather than wrapping each Error
    # inside a second "Duplicated field name" Error message.
    errors, new_data = read_excel(location, worksheet)
    return new_data, list(errors)

def strip_trailing_fields_csv(names):
    """
    Return a new list of the field `names` with leading and trailing
    whitespace removed from each name (see issue #456).
    """
    return [field_name.strip() for field_name in names]

def strip_trailing_fields_json(items):
    """
    Return a new list of dicts built from the `items` list of dicts where
    leading and trailing whitespace is removed from every field name (see
    issue #456). Values are kept as-is.
    """
    return [
        {key.strip(): entry[key] for key in entry}
        for entry in items
    ]

def normalize_dict_data(data):
    """
    Check if the input `data` comes from scancode-toolkit and, if it does,
    keep only the content of its "files" entry.

    Return the normalized data as a list: data that is not already a list is
    wrapped in a single-item list.
    """
    try:
        # A scancode-toolkit JSON output identifies itself in its first header.
        is_scancode = data["headers"][0]["tool_name"] == "scancode-toolkit"
    except (KeyError, IndexError, TypeError):
        # No usable headers: this is regular data.
        is_scancode = False

    # Default to the data as-is. This also covers input that has headers from
    # another tool, which previously left the result unassigned.
    new_data = data
    if is_scancode:
        try:
            # only takes data inside "files"
            new_data = data["files"]
        except (KeyError, TypeError):
            new_data = data

    if not isinstance(new_data, list):
        new_data = [new_data]
    return new_data

def transform_data(data, transformer):
    """
    Apply the transformations of the `transformer` Transformer to `data`:
    field renamings first, then the field filters and the field exclusions
    when these are configured, and finally the required fields check.

    Return a tuple of (data, [Error objects...]) where data is the
    transformed data when there is no error, or the original, untouched
    `data` when the required fields check reported errors.
    """
    transformed = transformer.apply_renamings(data)

    if transformer.field_filters:
        transformed = list(transformer.filter_fields(transformed))

    if transformer.exclude_fields:
        transformed = list(transformer.filter_excluded(transformed))

    errors = transformer.check_required_fields(transformed)
    result = data if errors else transformed
    return result, errors

# Help text describing the transform configuration file format. It is also
# used as the docstring of the Transformer class.
tranformer_config_help = """
A transform configuration file is used to describe which transformations and
validations to apply to a source CSV file. This is a simple text file using YAML
format, using the same format as an .ABOUT file.

The attributes that can be set in a configuration file are:

* field_renamings:
An optional map of target CSV/JSON new field name to source CSV or JSON field name that
is used to rename CSV/JSON fields.

For instance with this configuration the fields "Directory/Location" will be
renamed to "about_resource" and "foo" to "bar":
    field_renamings:
        about_resource : 'Directory/Location'
        bar : foo

The renaming is always applied first before other transforms and checks. All
other field names referenced below are these that exist AFTER the renamings
have been applied to the existing field names.

* required_fields:
An optional list of required field names that must have a value, beyond the
standard fields names. If a source CSV/JSON does not have such a field or a row is
missing a value for a required field, an error is reported.

For instance with this configuration an error will be reported if the fields
"name" and "version" are missing or if any row does not have a value set for
these fields:
    required_fields:
        - name
        - version

* field_filters:
An optional list of field names that should be kept in the transformed CSV/JSON. If
this list is provided, all the fields from the source CSV/JSON that should be kept
in the target CSV/JSON must be listed regardless of either standard or required
fields. If this list is not provided, all source CSV/JSON fields are kept in the
transformed target CSV/JSON.

For instance with this configuration the target CSV/JSON will only contain the "name"
and "version" fields and no other field:
    field_filters:
        - name
        - version

* exclude_fields:
An optional list of field names that should be excluded in the transformed CSV/JSON. If
this list is provided, all the fields from the source CSV/JSON that should be excluded
in the target CSV/JSON must be listed. Excluding standard or required fields will cause
an error. If this list is not provided, all source CSV/JSON fields are kept in the
transformed target CSV/JSON.

For instance with this configuration the target CSV/JSON will not contain the "type"
and "temp" fields:
    exclude_fields:
        - type
        - temp
"""

@attr.attributes
class Transformer(object):
    __doc__ = tranformer_config_help

    # mapping of {new (target) field name: existing (source) field name}
    field_renamings = attr.attrib(default=attr.Factory(dict))
    # list of field names that must have a value in every row
    required_fields = attr.attrib(default=attr.Factory(list))
    # list of the only field names to keep, if not empty
    field_filters = attr.attrib(default=attr.Factory(list))
    # list of field names to remove
    exclude_fields = attr.attrib(default=attr.Factory(list))

    # a list of all the standard fields from AboutCode toolkit
    standard_fields = attr.attrib(default=attr.Factory(list), init=False)
    # a list of the subset of standard fields that are essential and MUST be
    # present for AboutCode toolkit to work
    essential_fields = attr.attrib(default=attr.Factory(list), init=False)

    # called by attr after the __init__()
    def __attrs_post_init__(self, *args, **kwargs):
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from attributecode.model import About

        about = About()
        self.essential_fields = list(about.required_fields)
        self.standard_fields = [f.name for f in about.all_fields()]

    @classmethod
    def default(cls):
        """
        Return a default Transformer with built-in transforms.
        """
        return cls(
            field_renamings={},
            required_fields=[],
            field_filters=[],
            exclude_fields=[],
        )

    @classmethod
    def from_file(cls, location):
        """
        Load and return a Transformer instance from a YAML configuration file at
        `location`.

        Attributes that are absent from the file, or present without a value,
        get an empty default.
        """
        with open(location, encoding="utf-8", errors="replace") as conf:
            data = saneyaml.load(replace_tab_with_spaces(conf.read()))

        # Guard against a configuration that loads as an empty value (such as
        # an empty file) and against keys listed without any value: both would
        # otherwise surface as None and break later processing.
        data = data or {}
        return cls(
            field_renamings=data.get("field_renamings") or {},
            required_fields=data.get("required_fields") or [],
            field_filters=data.get("field_filters") or [],
            exclude_fields=data.get("exclude_fields") or [],
        )

    def check_required_fields(self, data):
        """
        Return a list of Error for a `data` list of dicts where a dict is
        missing a value for a required field name. Required fields are the
        essential fields plus the configured `required_fields`.

        Rows are numbered from zero in the error messages.
        """
        # Sorted so that the field names of a message are in a stable order.
        required = sorted(set(self.essential_fields + self.required_fields))
        if not required:
            return []

        errors = []
        msg = "Row {rn} is missing required values for fields: {missings}"
        for rn, item in enumerate(data):
            # A field that is absent or has an empty value counts as missing.
            missings = [rk for rk in required if not item.get(rk)]
            if not missings:
                continue
            errors.append(
                Error(CRITICAL, msg.format(rn=rn, missings=", ".join(missings)))
            )
        return errors

    def apply_renamings(self, data):
        """
        Return a transformed copy of `data` where the keys of every dict,
        including dicts nested in lists or in other dicts, are renamed based
        on the `field_renamings` of this Transformer. `field_renamings` maps a
        new field name to the existing field name it replaces.

        Return `data` itself, unchanged, when there is no renaming configured
        or when it is neither a dict nor a list.
        """
        renamings = self.field_renamings
        if not renamings:
            return data

        if isinstance(data, list):
            return [self.apply_renamings(item) for item in data]

        if not isinstance(data, dict):
            return data

        renamed_obj = {}
        for key, value in data.items():
            # An existing name may be the source of more than one new name.
            new_keys = [new for new, old in renamings.items() if old == key]
            # A key without any renaming is kept under its current name.
            for new_key in new_keys or [key]:
                renamed_obj[new_key] = self.apply_renamings(value)
        return renamed_obj

    def filter_fields(self, data):
        """
        Yield transformed dicts from a `data` list of dicts keeping only
        fields with a name in the `field_filters` of this Transformer.

        Note that every field is dropped when `field_filters` is empty:
        callers should only use this when `field_filters` is set.
        """
        field_filters = set(self.field_filters)
        for entry in data:
            yield {k: v for k, v in entry.items() if k in field_filters}

    def filter_excluded(self, data):
        """
        Return a list of transformed dicts from a `data` list of dicts
        excluding fields with names in the `exclude_fields` of this
        Transformer. Dicts nested in list values are filtered the same way.

        Items of `data` that are not dicts are kept as-is.
        """
        exclude_fields = set(self.exclude_fields)
        filtered_list = []
        for entry in data:
            if not isinstance(entry, dict):
                # Such as the strings of a list value: there is no field to
                # exclude in there.
                filtered_list.append(entry)
                continue

            result = {}
            for k, v in entry.items():
                # Test the exclusion first so that an excluded field is also
                # removed when its value is a list.
                if k in exclude_fields:
                    continue
                if isinstance(v, list):
                    result[k] = self.filter_excluded(v)
                else:
                    result[k] = v
            filtered_list.append(result)
        return filtered_list

def check_duplicate_fields(field_names):
    """
    Return a sorted list of the lowercased names that occur more than once in
    the `field_names` list of field name strings when compared ignoring case.
    Each duplicated name is listed once. The list is empty if there is no
    duplicate.
    """
    lowered = [name.lower() for name in field_names]
    tally = Counter(lowered)
    return sorted(name for name, seen in tally.items() if seen > 1)

def read_csv_rows(location):
    """
    Yield rows (as a list of values) from a CSV file at `location`.

    The file is read as UTF-8 and undecodable bytes are replaced rather than
    raising an error.
    """
    # newline="" lets the csv module do its own newline handling, so line
    # breaks embedded in quoted values are returned unmodified.
    with open(location, encoding="utf-8", errors="replace", newline="") as csvfile:
        yield from csv.reader(csvfile)

def read_json(location):
    """
    Return the data loaded from the JSON file at `location`.

    The file is read as UTF-8 and undecodable bytes are replaced rather than
    raising an error.
    """
    with open(location, encoding="utf-8", errors="replace") as jsonfile:
        text = jsonfile.read()
    return json.loads(text)

def write_csv(location, data):
    """
    Write a CSV file at `location` with the `data` which is a list of dicts.

    The header is made of the field names of all the dicts, in order of first
    appearance. A dict that does not have one of these fields gets an empty
    value for it. An empty file is written when there is no field at all, such
    as with an empty `data` list.
    """
    # Collect the names from every row and not only from the first one: a row
    # with an extra field would otherwise make the DictWriter fail.
    field_names = list(dict.fromkeys(name for row in data for name in row))

    # newline="" leaves the line endings entirely to the csv module.
    with open(location, "w", encoding="utf-8", newline="", errors="replace") as csvfile:
        if not field_names:
            return
        writer = csv.DictWriter(csvfile, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(data)

def write_json(location, data):
    """
    Serialize the `data` list of dicts as JSON, indented with three spaces,
    and write it to the file at `location`.
    """
    serialized = json.dumps(data, indent=3)
    with open(location, "w") as jsonfile:
        jsonfile.write(serialized)

def read_excel(location, worksheet=None):
    """
    Read the XLSX file at `location` and return a tuple of (errors, results)
    where errors is a list of Error objects and results is a list of ordered
    dictionaries, one for each row below the header row.

    `worksheet` is an optional worksheet name; when not provided the active
    worksheet of the workbook is used.

    The values of the first row are used as the dictionary keys. Reading stops
    at the first duplicated column name, which is reported as a CRITICAL
    Error together with an empty list of results.
    """
    results = []
    errors = []
    input_bom = openpyxl.load_workbook(location)
    sheet_obj = input_bom[worksheet] if worksheet else input_bom.active
    max_col = sheet_obj.max_column

    # Collect the column names from the first row.
    col_keys = []
    for column in range(1, max_col + 1):
        value = sheet_obj.cell(row=1, column=column).value
        if value in col_keys:
            msg = "Duplicated column name, " + str(value) + ", detected."
            errors.append(Error(CRITICAL, msg))
            return errors, results
        col_keys.append(value)

    for row in sheet_obj.iter_rows(min_row=2, max_col=max_col, values_only=True):
        row_dict = OrderedDict()
        for key, value in zip(col_keys, row):
            # Only an empty cell becomes an empty string: legitimate falsy
            # values such as 0 or False are kept as they are.
            row_dict[key] = "" if value is None else value
        results.append(row_dict)
    return errors, results

def write_excel(location, data):
    """
    Write a XLSX file at `location` with the `data` which is a list of dicts.

    The header row is made of the field names of all the dicts, in order of
    first appearance, followed by one row for each dict. A dict that does not
    have one of these fields gets an empty cell for it. A workbook without any
    row is saved when `data` is empty.
    """
    wb = openpyxl.Workbook()
    ws = wb.active

    # Build the header from every row and not only from the first one so that
    # a field present only in a later row is not silently dropped.
    headers = list(dict.fromkeys(name for row in data for name in row))
    if headers:
        ws.append(headers)
        for elements in data:
            ws.append([elements.get(h) for h in headers])

    wb.save(location)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

FilesExpand file tree

transform.py

Latest commit

History

transform.py

File metadata and controls