import base64
import copy
import json
import logging
import os
import tempfile
from typing import Any, Dict, List, Tuple, Union, Set
from azure.identity import DefaultAzureCredential
from jinja2 import Template
from loguru import logger
from pyhocon import ConfigFactory
import redis
from feathr.constants import *
from feathr.definition._materialization_utils import _to_materialization_config
from feathr.definition.anchor import FeatureAnchor
from feathr.definition.config_helper import FeathrConfigHelper
from feathr.definition.feature import FeatureBase
from feathr.definition.feature_derivations import DerivedFeature
from feathr.definition.materialization_settings import MaterializationSettings
from feathr.definition.monitoring_settings import MonitoringSettings
from feathr.definition.query_feature_list import FeatureQuery
from feathr.definition.settings import ObservationSettings, ConflictsAutoCorrection
from feathr.definition.sink import HdfsSink, Sink
from feathr.definition.source import InputContext
from feathr.definition.transformation import WindowAggTransformation
from feathr.definition.typed_key import TypedKey
from feathr.protobuf.featureValue_pb2 import FeatureValue
from feathr.registry._feathr_registry_client import _FeatureRegistry, derived_feature_to_def, feature_to_def
from feathr.registry._feature_registry_purview import _PurviewRegistry
from feathr.spark_provider._databricks_submission import _FeathrDatabricksJobLauncher
from feathr.spark_provider._localspark_submission import _FeathrLocalSparkJobLauncher
from feathr.spark_provider._synapse_submission import _FeathrSynapseJobLauncher
from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration
from feathr.udf._preprocessing_pyudf_manager import _PreprocessingPyudfManager
from feathr.utils._env_config_reader import EnvConfigReader
from feathr.utils._file_utils import write_to_file
from feathr.utils.feature_printer import FeaturePrinter
from feathr.utils.spark_job_params import FeatureGenerationJobParams, FeatureJoinJobParams
from feathr.version import get_version
import importlib.util
class FeathrClient(object):
"""Feathr client.
The client is used to create training datasets, materialize features, register features, and fetch features from
the online storage.
For offline storage and compute engine, Azure ADLS, AWS S3 and Azure Synapse are supported.
For online storage, currently only Redis is supported.
Users of this client are responsible for setting up all the necessary information needed to start a Redis client via
environment variables or a Spark cluster. Host address, port and password are needed to start the Redis client.
Raises:
RuntimeError: Fail to create the client since necessary environment variables are not set for Redis
client creation.
"""
def __init__(
self,
config_path: str = "./feathr_config.yaml",
local_workspace_dir: str = None,
credential: Any = None,
project_registry_tag: Dict[str, str] = None,
):
"""Initialize Feathr Client.
Configuration values used by Feathr are evaluated in the following order of precedence, with items higher on the list taking priority.
1. Environment variables
2. Values in the configuration file
3. Values in the Azure Key Vault
Args:
config_path (optional): Config yaml file path. See [Feathr Config Template](https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for more details. Defaults to "./feathr_config.yaml".
local_workspace_dir (optional): The local workspace directory. If not set, Feathr will create a temporary folder to store local workspace related files.
credential (optional): Azure credential to access cloud resources, most likely to be the returned result of DefaultAzureCredential(). If not set, Feathr will initialize DefaultAzureCredential() inside the __init__ function to get credentials.
project_registry_tag (optional): Tags to add to the project in the Feathr registry. This might be useful if you want to tag your project as deprecated, or allow certain customizations at the project level. Defaults to empty.
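Example (a minimal initialization sketch; the config path shown is the default, and all other arguments are optional):
    >>> from feathr import FeathrClient
    >>> client = FeathrClient(config_path="./feathr_config.yaml")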
"""
self.logger = logging.getLogger(__name__)
# Redis key separator
self._KEY_SEPARATOR = ":"
self._COMPOSITE_KEY_SEPARATOR = "#"
self.env_config = EnvConfigReader(config_path=config_path)
if local_workspace_dir:
self.local_workspace_dir = local_workspace_dir
else:
# this is required for Windows
tem_dir_obj = tempfile.TemporaryDirectory()
self.local_workspace_dir = tem_dir_obj.name
if not os.path.exists(config_path):
self.logger.warning(
"No Configuration file exist at the user provided config_path or the default config_path (./feathr_config.yaml), you need to set the environment variables explicitly. For all the environment variables that you need to set, please refer to https://github.com/feathr-ai/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml"
)
# Load all configs from yaml at initialization
# DO NOT load any configs from yaml during runtime.
self.project_name = self.env_config.get("project_config__project_name")
# Redis configs. This is optional unless users have configured Redis host.
if self.env_config.get("online_store__redis__host"):
# Warn if the local Redis client package is not installed.
spec = importlib.util.find_spec("redis")
if spec is None:
self.logger.warning(
'You have configured Redis host, but there is no local Redis client package. Install the package using "pip install redis". '
)
self.redis_host = self.env_config.get("online_store__redis__host")
self.redis_port = self.env_config.get("online_store__redis__port")
self.redis_ssl_enabled = self.env_config.get("online_store__redis__ssl_enabled")
self._construct_redis_client()
# Offline store enabled configs; false by default
self.s3_enabled = self.env_config.get("offline_store__s3__s3_enabled")
self.adls_enabled = self.env_config.get("offline_store__adls__adls_enabled")
self.wasb_enabled = self.env_config.get("offline_store__wasb__wasb_enabled")
self.jdbc_enabled = self.env_config.get("offline_store__jdbc__jdbc_enabled")
self.snowflake_enabled = self.env_config.get("offline_store__snowflake__snowflake_enabled")
if not (
self.s3_enabled or self.adls_enabled or self.wasb_enabled or self.jdbc_enabled or self.snowflake_enabled
):
self.logger.warning("No offline storage enabled.")
# S3 configs
if self.s3_enabled:
self.s3_endpoint = self.env_config.get("offline_store__s3__s3_endpoint")
# spark configs
self.output_num_parts = self.env_config.get("spark_config__spark_result_output_parts")
self.spark_runtime = self.env_config.get("spark_config__spark_cluster")
self.credential = credential
if self.spark_runtime not in {"azure_synapse", "databricks", "local"}:
raise RuntimeError(
f"{self.spark_runtime} is not supported. Only 'azure_synapse', 'databricks' and 'local' are currently supported."
)
elif self.spark_runtime == "azure_synapse":
# Feathr is a Spark-based application, so the feathr jar compiled from source code will be used in the
# Spark job submission. The feathr jar hosted in the cloud saves users the time needed to upload the jar
# from their local environment.
self._FEATHR_JOB_JAR_PATH = self.env_config.get("spark_config__azure_synapse__feathr_runtime_location")
if self.credential is None:
self.credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)
self.feathr_spark_launcher = _FeathrSynapseJobLauncher(
synapse_dev_url=self.env_config.get("spark_config__azure_synapse__dev_url"),
pool_name=self.env_config.get("spark_config__azure_synapse__pool_name"),
datalake_dir=self.env_config.get("spark_config__azure_synapse__workspace_dir"),
executor_size=self.env_config.get("spark_config__azure_synapse__executor_size"),
executors=self.env_config.get("spark_config__azure_synapse__executor_num"),
credential=self.credential,
)
elif self.spark_runtime == "databricks":
# Feathr is a Spark-based application, so the feathr jar compiled from source code will be used in the
# Spark job submission. The feathr jar hosted in the cloud saves users the time needed to upload the jar
# from their local environment.
self._FEATHR_JOB_JAR_PATH = self.env_config.get("spark_config__databricks__feathr_runtime_location")
self.feathr_spark_launcher = _FeathrDatabricksJobLauncher(
workspace_instance_url=self.env_config.get("spark_config__databricks__workspace_instance_url"),
token_value=self.env_config.get_from_env_or_akv("DATABRICKS_WORKSPACE_TOKEN_VALUE"),
config_template=self.env_config.get("spark_config__databricks__config_template"),
databricks_work_dir=self.env_config.get("spark_config__databricks__work_dir"),
)
elif self.spark_runtime == "local":
self._FEATHR_JOB_JAR_PATH = self.env_config.get("spark_config__local__feathr_runtime_location")
self.feathr_spark_launcher = _FeathrLocalSparkJobLauncher(
workspace_path=self.env_config.get("spark_config__local__workspace"),
master=self.env_config.get("spark_config__local__master"),
)
self.secret_names = []
# initialize config helper
self.config_helper = FeathrConfigHelper()
# initialize registry
self.registry = None
registry_endpoint = self.env_config.get("feature_registry__api_endpoint")
azure_purview_name = self.env_config.get("feature_registry__purview__purview_name")
if registry_endpoint:
self.registry = _FeatureRegistry(
self.project_name, endpoint=registry_endpoint, project_tags=project_registry_tag, credential=credential
)
elif azure_purview_name:
registry_delimiter = self.env_config.get("feature_registry__purview__delimiter")
# Initialize the registry regardless of whether the purview name is set, since some of its methods are used elsewhere.
self.registry = _PurviewRegistry(
self.project_name,
azure_purview_name,
registry_delimiter,
project_registry_tag,
config_path=config_path,
credential=credential,
)
logger.warning(
"FEATURE_REGISTRY__PURVIEW__PURVIEW_NAME will be deprecated soon. Please use FEATURE_REGISTRY__API_ENDPOINT instead."
)
else:
# no registry configured
logger.info(
"Feathr registry is not configured. Consider setting the Feathr registry component for richer feature store experience."
)
logger.info(f"Feathr client {get_version()} initialized successfully.")
def _check_required_environment_variables_exist(self):
"""Checks if the required environment variables(form feathr_config.yaml) is set.
Some required information has to be set via environment variables so the client can work.
"""
props = self.secret_names
for required_field in self.required_fields + props:
if required_field not in os.environ:
raise RuntimeError(
f"{required_field} is not set in environment variable. All required environment "
f"variables are: {self.required_fields}."
)
def register_features(self, from_context: bool = True):
"""Registers features based on the current workspace
Args:
from_context: If from_context is True (default), the features will be generated from the current context, with the features previously built by client.build_features(). Otherwise, the features will be generated from
configuration files.
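Example (a hedged sketch; assumes a registry endpoint is configured and features were already built, and the anchor and feature names below are hypothetical):
    >>> client.build_features(anchor_list=[request_anchor], derived_feature_list=[f_trip_time_distance])
    >>> client.register_features(from_context=True)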
"""
if from_context:
# make sure those items are in `self`
if "anchor_list" in dir(self) and "derived_feature_list" in dir(self):
self.config_helper.save_to_feature_config_from_context(
self.anchor_list, self.derived_feature_list, self.local_workspace_dir
)
self.registry.register_features(
self.local_workspace_dir,
from_context=from_context,
anchor_list=self.anchor_list,
derived_feature_list=self.derived_feature_list,
)
else:
raise RuntimeError("Please call FeathrClient.build_features() first in order to register features")
else:
self.registry.register_features(self.local_workspace_dir, from_context=from_context)
def build_features(
self,
anchor_list: List[FeatureAnchor] = [],
derived_feature_list: List[DerivedFeature] = [],
verbose: bool = False,
):
"""Build features based on the current workspace. all actions that triggers a spark job will be based on the
result of this action.
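Example (an illustrative sketch; FeatureAnchor is imported above, while INPUT_CONTEXT and f_trip_distance are assumed to be imported/defined elsewhere):
    >>> request_anchor = FeatureAnchor(name="request_features", source=INPUT_CONTEXT, features=[f_trip_distance])
    >>> client.build_features(anchor_list=[request_anchor], derived_feature_list=[], verbose=True)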
"""
# Run necessary validations
# anchor name and source name should be unique
anchor_names = {}
source_names = {}
for anchor in anchor_list:
if anchor.name in anchor_names:
raise RuntimeError(
f"Anchor name should be unique but there are duplicate anchor names in your anchor "
f"definitions. Anchor name of {anchor} is already defined in {anchor_names[anchor.name]}"
)
else:
anchor_names[anchor.name] = anchor
if anchor.source.name in source_names and (anchor.source is not source_names[anchor.source.name]):
raise RuntimeError(
f"Source name should be unique but there are duplicate source names in your source "
f"definitions. Source name of {anchor.source} is already defined in {source_names[anchor.source.name]}"
)
else:
source_names[anchor.source.name] = anchor.source
_PreprocessingPyudfManager.build_anchor_preprocessing_metadata(anchor_list, self.local_workspace_dir)
self.config_helper.save_to_feature_config_from_context(
anchor_list, derived_feature_list, self.local_workspace_dir
)
self.anchor_list = anchor_list
self.derived_feature_list = derived_feature_list
# Check if data source used by every anchor requires additional system properties to be set
props = []
for anchor in self.anchor_list:
if hasattr(anchor.source, "get_required_properties"):
props.extend(anchor.source.get_required_properties())
# Reset `system_properties`
self.secret_names = props
# Pretty print anchor_list
if verbose and self.anchor_list:
FeaturePrinter.pretty_print_anchors(self.anchor_list)
def get_snowflake_path(self, database: str, schema: str, dbtable: str = None, query: str = None) -> str:
"""
Returns snowflake path given dataset location information.
Either dbtable or query must be specified but not both.
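Example (illustrative database, schema, and table names):
    >>> client.get_snowflake_path(database="NYC_TAXI", schema="PUBLIC", dbtable="TRIPS")
    'snowflake://snowflake_account/?sfDatabase=NYC_TAXI&sfSchema=PUBLIC&dbtable=TRIPS'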
"""
if dbtable is not None and query is not None:
raise RuntimeError("Both dbtable and query are specified. Can only specify one..")
if dbtable is None and query is None:
raise RuntimeError("One of dbtable or query must be specified..")
if dbtable:
return f"snowflake://snowflake_account/?sfDatabase={database}&sfSchema={schema}&dbtable={dbtable}"
else:
return f"snowflake://snowflake_account/?sfDatabase={database}&sfSchema={schema}&query={query}"
def list_registered_features(self, project_name: str = None) -> List[str]:
"""List all the already registered features under the given project.
`project_name` must not be None or an empty string, as that would violate the RBAC policy.
"""
return self.registry.list_registered_features(project_name)
def list_dependent_entities(self, qualified_name: str):
"""
Lists all dependent/downstream entities for a given entity
"""
return self.registry.list_dependent_entities(qualified_name)
def delete_entity(self, qualified_name: str):
"""
Deletes a single entity if it has no downstream/dependent entities
"""
return self.registry.delete_entity(qualified_name)
def _get_registry_client(self):
"""
Returns registry client in case users want to perform more advanced operations
"""
return self.registry._get_registry_client()
def get_online_features(self, feature_table: str, key: Any, feature_names: List[str]):
"""Fetches feature value for a certain key from a online feature table.
Args:
feature_table: the name of the feature table.
key: the key/key list of the entity;
for key list, please make sure the order is consistent with the one in feature's definition;
the order can be found by 'get_features_from_registry'.
feature_names: list of feature names to fetch
Return:
A list of feature values for this entity. It's ordered by the requested feature names.
For example, feature_names = ['f_is_medium_trip_distance', 'f_day_of_week', 'f_day_of_month', 'f_hour_of_day']
then the returned feature values are: [b'true', b'4.0', b'31.0', b'23.0'].
If the feature_table or key doesn't exist, a list of Nones is returned. For example,
[None, None, None, None].
If a feature doesn't exist, then a None is returned for that feature. For example:
[None, b'4.0', b'31.0', b'23.0'].
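Example (a usage sketch; the table name, key and feature names are illustrative):
    >>> client.get_online_features(
    ...     feature_table="nycTaxiDemoFeature",
    ...     key="265",
    ...     feature_names=["f_is_medium_trip_distance", "f_day_of_week"],
    ... )
    [b'true', b'4.0']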
"""
redis_key = self._construct_redis_key(feature_table, key)
res = self.redis_client.hmget(redis_key, *feature_names)
return self._decode_proto(res)
def multi_get_online_features(self, feature_table: str, keys: List[Any], feature_names: List[str]):
"""Fetches feature value for a list of keys from a online feature table. This is the batch version of the get API.
Args:
feature_table: the name of the feature table.
keys: list of keys/composite keys for the entities;
for composite keys, please make sure each order of them is consistent with the one in feature's definition;
the order can be found by 'get_features_from_registry'.
feature_names: list of feature names to fetch
Return:
A dict mapping each requested key to its list of feature values, ordered by the requested feature names. For
example, keys = [12, 24], feature_names = ['f_is_medium_trip_distance', 'f_day_of_week', 'f_day_of_month',
'f_hour_of_day'], then the returned feature values are: {'12': [b'false', b'5.0', b'1.0', b'0.0'],
'24': [b'true', b'4.0', b'31.0', b'23.0']}. If the feature_table or a key doesn't exist, a list of Nones
is returned for that key. For example, {'12': [None, None, None, None], '24': [None, None, None, None]}. If a feature
doesn't exist, a None is returned for that feature. For example: {'12': [None, b'4.0', b'31.0',
b'23.0'], '24': [b'true', b'4.0', b'31.0', b'23.0']}.
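Example (a usage sketch mirroring the values above; the table name is illustrative):
    >>> client.multi_get_online_features(
    ...     feature_table="nycTaxiDemoFeature",
    ...     keys=["12", "24"],
    ...     feature_names=["f_is_medium_trip_distance", "f_day_of_week"],
    ... )
    {'12': [b'false', b'5.0'], '24': [b'true', b'4.0']}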
"""
with self.redis_client.pipeline() as redis_pipeline:
for key in keys:
redis_key = self._construct_redis_key(feature_table, key)
redis_pipeline.hmget(redis_key, *feature_names)
pipeline_result = redis_pipeline.execute()
decoded_pipeline_result = []
for feature_list in pipeline_result:
decoded_pipeline_result.append(self._decode_proto(feature_list))
for i in range(len(keys)):
if isinstance(keys[i], List):
keys[i] = self._COMPOSITE_KEY_SEPARATOR.join(keys[i])
return dict(zip(keys, decoded_pipeline_result))
def _decode_proto(self, feature_list):
"""Decode the bytes(in string form) via base64 decoder. For dense array, it will be returned as Python List.
For sparse array, it will be returned as tuple of index array and value array. The order of elements in the
arrays won't be changed.
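Example of the returned shapes (illustrative values only):
    dense float array  -> [1.0, 2.0, 3.0]
    sparse float array -> ([0, 5, 9], [1.0, 2.0, 3.0])  # (index array, value array)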
"""
typed_result = []
for raw_feature in feature_list:
if raw_feature:
feature_value = FeatureValue()
decoded = base64.b64decode(raw_feature)
feature_value.ParseFromString(decoded)
if feature_value.WhichOneof("FeatureValueOneOf") == "boolean_value":
typed_result.append(feature_value.boolean_value)
elif feature_value.WhichOneof("FeatureValueOneOf") == "string_value":
typed_result.append(feature_value.string_value)
elif feature_value.WhichOneof("FeatureValueOneOf") == "float_value":
typed_result.append(feature_value.float_value)
elif feature_value.WhichOneof("FeatureValueOneOf") == "double_value":
typed_result.append(feature_value.double_value)
elif feature_value.WhichOneof("FeatureValueOneOf") == "int_value":
typed_result.append(feature_value.int_value)
elif feature_value.WhichOneof("FeatureValueOneOf") == "long_value":
typed_result.append(feature_value.long_value)
elif feature_value.WhichOneof("FeatureValueOneOf") == "int_array":
typed_result.append(feature_value.int_array.integers)
elif feature_value.WhichOneof("FeatureValueOneOf") == "string_array":
typed_result.append(feature_value.string_array.strings)
elif feature_value.WhichOneof("FeatureValueOneOf") == "float_array":
typed_result.append(feature_value.float_array.floats)
elif feature_value.WhichOneof("FeatureValueOneOf") == "double_array":
typed_result.append(feature_value.double_array.doubles)
elif feature_value.WhichOneof("FeatureValueOneOf") == "boolean_array":
typed_result.append(feature_value.boolean_array.booleans)
elif feature_value.WhichOneof("FeatureValueOneOf") == "sparse_string_array":
typed_result.append(
(
feature_value.sparse_string_array.index_integers,
feature_value.sparse_string_array.value_strings,
)
)
elif feature_value.WhichOneof("FeatureValueOneOf") == "sparse_bool_array":
typed_result.append(
(feature_value.sparse_bool_array.index_integers, feature_value.sparse_bool_array.value_booleans)
)
elif feature_value.WhichOneof("FeatureValueOneOf") == "sparse_float_array":
typed_result.append(
(feature_value.sparse_float_array.index_integers, feature_value.sparse_float_array.value_floats)
)
elif feature_value.WhichOneof("FeatureValueOneOf") == "sparse_double_array":
typed_result.append(
(
feature_value.sparse_double_array.index_integers,
feature_value.sparse_double_array.value_doubles,
)
)
elif feature_value.WhichOneof("FeatureValueOneOf") == "sparse_long_array":
typed_result.append(
(feature_value.sparse_long_array.index_integers, feature_value.sparse_long_array.value_longs)
)
else:
self.logger.debug(
"Fail to load the feature type. Maybe a new type that is not supported by this "
"client version"
)
self.logger.debug(f"The raw feature is {raw_feature}.")
self.logger.debug(f"The loaded feature is {feature_value}")
typed_result.append(None)
else:
typed_result.append(raw_feature)
return typed_result
def delete_feature_from_redis(self, feature_table, key, feature_name) -> None:
"""
Delete feature from Redis
Args:
feature_table: the name of the feature table
key: the key of the entity
feature_name: feature name to be deleted
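Example (illustrative table, key and feature names):
    >>> client.delete_feature_from_redis("nycTaxiDemoFeature", "265", "f_day_of_week")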
"""
redis_key = self._construct_redis_key(feature_table, key)
if self.redis_client.hexists(redis_key, feature_name):
# Delete only the feature field from the hash; `delete` would remove the whole key.
self.redis_client.hdel(redis_key, feature_name)
print(f"Deletion successful. {feature_name} is deleted from Redis.")
else:
raise RuntimeError(f"Deletion failed. {feature_name} not found in Redis.")
def _clean_test_data(self, feature_table):
"""
WARNING: THIS IS ONLY USED FOR TESTING
Clears a namespace in redis cache.
This may be very time consuming.
Args:
feature_table: str, i.e. your prefix before the separator in the Redis database.
"""
cursor = "0"
ns_keys = feature_table + "*"
while cursor != 0:
# A scan count of 5000 seems reasonably fast for our testing data
cursor, keys = self.redis_client.scan(cursor=cursor, match=ns_keys, count=5000)
if keys:
self.redis_client.delete(*keys)
def _construct_redis_key(self, feature_table, key):
if isinstance(key, List):
key = self._COMPOSITE_KEY_SEPARATOR.join(key)
return feature_table + self._KEY_SEPARATOR + key
def _str_to_bool(self, s: str, variable_name=None):
"""Define a function to detect convert string to bool, since Redis client sometimes require a bool and sometimes require a str"""
if (isinstance(s, str) and s.casefold() == "True".casefold()) or s == True:
return True
elif (isinstance(s, str) and s.casefold() == "False".casefold()) or s == False:
return False
else:
self.logger.warning(
f"{s} is not a valid Bool value. Maybe you want to double check if it is set correctly for {variable_name}."
)
return s
def _construct_redis_client(self):
"""Constructs the Redis client. The host, port, credential and other parameters can be set via environment
parameters.
"""
password = self.env_config.get_from_env_or_akv(REDIS_PASSWORD)
host = self.redis_host
port = self.redis_port
ssl_enabled = self.redis_ssl_enabled
self.redis_client = redis.Redis(
host=host, port=port, password=password, ssl=self._str_to_bool(ssl_enabled, "ssl_enabled")
)
self.logger.info("Redis connection is successful and completed.")
def get_offline_features(
self,
observation_settings: ObservationSettings,
feature_query: Union[FeatureQuery, List[FeatureQuery]],
output_path: Union[str, Sink],
execution_configurations: Union[SparkExecutionConfiguration, Dict[str, str]] = {},
config_file_name: str = "feature_join_conf/feature_join.conf",
dataset_column_names: Set[str] = None,
verbose: bool = False,
):
"""
Get offline features for the observation dataset
Args:
observation_settings: settings of the observation data, e.g. timestamp columns, input path, etc.
feature_query: features requested to be joined onto the observation data
output_path: output path of job, i.e. the observation data with features attached.
execution_configurations: a dict that will be passed to spark job when the job starts up, i.e. the "spark configurations". Note that not all of the configuration will be honored since some of the configurations are managed by the Spark platform, such as Databricks or Azure Synapse. Refer to the [spark documentation](https://spark.apache.org/docs/latest/configuration.html) for a complete list of spark configurations.
config_file_name: the name of the config file that will be passed to the spark job. The config file is used to configure the spark job. The default value is "feature_join_conf/feature_join.conf".
dataset_column_names: column names of the observation dataset. Used to check for conflicts with feature names if the real column names cannot be read from the observation dataset.
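Example (a hedged sketch; the paths, timestamp settings and feature names are illustrative, location_id is a hypothetical TypedKey, and FeatureQuery, ObservationSettings and SparkExecutionConfiguration come from the imports above):
    >>> query = FeatureQuery(feature_list=["f_location_avg_fare"], key=location_id)
    >>> settings = ObservationSettings(
    ...     observation_path="abfss://container@account.dfs.core.windows.net/observation.csv",
    ...     event_timestamp_column="lpep_dropoff_datetime",
    ...     timestamp_format="yyyy-MM-dd HH:mm:ss",
    ... )
    >>> client.get_offline_features(
    ...     observation_settings=settings,
    ...     feature_query=query,
    ...     output_path="abfss://container@account.dfs.core.windows.net/output.avro",
    ...     execution_configurations=SparkExecutionConfiguration({"spark.feathr.outputFormat": "avro"}),
    ... )
    >>> client.wait_job_to_finish(timeout_sec=500)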
"""
feature_queries = feature_query if isinstance(feature_query, List) else [feature_query]
feature_names = []
for feature_query in feature_queries:
for feature_name in feature_query.feature_list:
feature_names.append(feature_name)
if len(feature_names) > 0 and observation_settings.conflicts_auto_correction is None:
import feathr.utils.job_utils as job_utils
dataset_column_names_from_path = job_utils.get_cloud_file_column_names(
self,
observation_settings.observation_path,
observation_settings.file_format,
observation_settings.is_file_path,
)
if (
dataset_column_names_from_path is None or len(dataset_column_names_from_path) == 0
) and dataset_column_names is None:
self.logger.warning(
f"Feathr is unable to read the Observation data from {observation_settings.observation_path} due to permission issue or invalid path. Please either grant the permission or supply the observation column names in the filed: observation_column_names."
)
else:
if dataset_column_names_from_path is not None and len(dataset_column_names_from_path) > 0:
dataset_column_names = dataset_column_names_from_path
conflict_names = []
for feature_name in feature_names:
if feature_name in dataset_column_names:
conflict_names.append(feature_name)
if len(conflict_names) != 0:
conflict_names = ",".join(conflict_names)
raise RuntimeError(f"Feature names exist conflicts with dataset column names: {conflict_names}")
udf_files = _PreprocessingPyudfManager.prepare_pyspark_udf_files(feature_names, self.local_workspace_dir)
# produce join config
tm = Template(
"""
{{observation_settings.to_feature_config()}}
featureList: [
{% for list in feature_lists %}
{{list.to_feature_config()}}
{% endfor %}
]
outputPath: "{{output_path}}"
"""
)
config = tm.render(
feature_lists=feature_queries, observation_settings=observation_settings, output_path=output_path
)
config_file_path = os.path.join(self.local_workspace_dir, config_file_name)
# make sure `FeathrClient.build_features()` is called before getting offline features/materializing features;
# otherwise users will be confused about which features are available.
# build_features() assigns the anchor_list and derived_feature_list attributes, so we check that both exist to make sure the condition above is met.
if "anchor_list" in dir(self) and "derived_feature_list" in dir(self):
self.config_helper.save_to_feature_config_from_context(
self.anchor_list, self.derived_feature_list, self.local_workspace_dir
)
else:
raise RuntimeError("Please call FeathrClient.build_features() first in order to get offline features")
# Pretty print feature_query
if verbose and feature_query:
FeaturePrinter.pretty_print_feature_query(feature_query)
write_to_file(content=config, full_file_name=config_file_path)
return self._get_offline_features_with_config(
config_file_path,
output_path=output_path,
execution_configurations=execution_configurations,
udf_files=udf_files,
)
def _get_offline_features_with_config(
self,
feature_join_conf_path="feature_join_conf/feature_join.conf",
output_path: Union[str, Sink] = "",
execution_configurations: Dict[str, str] = {},
udf_files=[],
):
"""Joins the features to your offline observation dataset based on the join config.
Args:
feature_join_conf_path: Relative path to your feature join config file.
"""
cloud_udf_paths = [
self.feathr_spark_launcher.upload_or_get_cloud_path(udf_local_path) for udf_local_path in udf_files
]
feathr_feature = ConfigFactory.parse_file(feature_join_conf_path)
feature_join_job_params = FeatureJoinJobParams(
join_config_path=os.path.abspath(feature_join_conf_path),
observation_path=feathr_feature["observationPath"],
feature_config=os.path.join(self.local_workspace_dir, "feature_conf/"),
job_output_path=output_path,
)
job_tags = {OUTPUT_PATH_TAG: feature_join_job_params.job_output_path}
# set output format in job tags if it's set by user, so that it can be used to parse the job result in the helper function
if execution_configurations is not None and OUTPUT_FORMAT in execution_configurations:
job_tags[OUTPUT_FORMAT] = execution_configurations[OUTPUT_FORMAT]
else:
job_tags[OUTPUT_FORMAT] = "avro"
"""
- Job tags are for job metadata and it's not passed to the actual spark job (i.e. not visible to spark job), more like a platform related thing that Feathr want to add (currently job tags only have job output URL and job output format, ). They are carried over with the job and is visible to every Feathr client. Think this more like some customized metadata for the job which would be weird to be put in the spark job itself.
- Job arguments (or sometimes called job parameters)are the arguments which are command line arguments passed into the actual spark job. This is usually highly related with the spark job. In Feathr it's like the input to the scala spark CLI. They are usually not spark specific (for example if we want to specify the location of the feature files, or want to
- Job configuration are like "configurations" for the spark job and are usually spark specific. For example, we want to control the no. of write parts for spark
Job configurations and job arguments (or sometimes called job parameters) have quite some overlaps (i.e. you can achieve the same goal by either using the job arguments/parameters vs. job configurations). But the job tags should just be used for metadata purpose.
"""
# submit the jars
return self.feathr_spark_launcher.submit_feathr_job(
job_name=self.project_name + "_feathr_feature_join_job",
main_jar_path=self._FEATHR_JOB_JAR_PATH,
python_files=cloud_udf_paths,
job_tags=job_tags,
main_class_name=JOIN_CLASS_NAME,
arguments=[
"--join-config",
self.feathr_spark_launcher.upload_or_get_cloud_path(feature_join_job_params.join_config_path),
"--input",
feature_join_job_params.observation_path,
"--output",
feature_join_job_params.job_output_path,
"--feature-config",
self.feathr_spark_launcher.upload_or_get_cloud_path(feature_join_job_params.feature_config),
"--num-parts",
self.output_num_parts,
]
+ self._get_offline_storage_arguments(),
reference_files_path=[],
configuration=execution_configurations,
properties=self._collect_secrets(feature_join_job_params.secrets),
)
def _get_offline_storage_arguments(self):
arguments = []
if self.s3_enabled:
arguments.append("--s3-config")
arguments.append(self._get_s3_config_str())
if self.adls_enabled:
arguments.append("--adls-config")
arguments.append(self._get_adls_config_str())
if self.wasb_enabled:
arguments.append("--blob-config")
arguments.append(self._get_blob_config_str())
if self.jdbc_enabled:
arguments.append("--sql-config")
arguments.append(self._get_sql_config_str())
if self.snowflake_enabled:
arguments.append("--snowflake-config")
arguments.append(self._get_snowflake_config_str())
return arguments
def get_job_result_uri(self, block=True, timeout_sec=300) -> str:
"""Gets the job output URI"""
if not block:
return self.feathr_spark_launcher.get_job_result_uri()
# Block the API by pooling the job status and wait for complete
if self.feathr_spark_launcher.wait_for_completion(timeout_sec):
return self.feathr_spark_launcher.get_job_result_uri()
else:
raise RuntimeError("Spark job failed so output cannot be retrieved.")
def get_job_tags(self) -> Dict[str, str]:
"""Gets the job tags"""
return self.feathr_spark_launcher.get_job_tags()
def wait_job_to_finish(self, timeout_sec: int = 300):
"""Waits for the job to finish in a blocking way unless it times out"""
if self.feathr_spark_launcher.wait_for_completion(timeout_sec):
return
else:
raise RuntimeError("Spark job failed.")
def monitor_features(
self,
settings: MonitoringSettings,
execution_configurations: Union[SparkExecutionConfiguration, Dict[str, str]] = {},
verbose: bool = False,
):
"""Create a offline job to generate statistics to monitor feature data
Args:
settings: Feature monitoring settings
execution_configurations: a dict that will be passed to spark job when the job starts up, i.e. the "spark configurations". Note that not all of the configuration will be honored since some of the configurations are managed by the Spark platform, such as Databricks or Azure Synapse. Refer to the [spark documentation](https://spark.apache.org/docs/latest/configuration.html) for a complete list of spark configurations.
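Example (a heavily hedged sketch; MonitoringSqlSink is assumed to be available from the feathr package, and the sink table and feature names are illustrative):
    >>> monitor_sink = MonitoringSqlSink(table_name="feature_monitoring_table")
    >>> settings = MonitoringSettings("monitoringSetting", sinks=[monitor_sink], feature_names=["f_location_avg_fare"])
    >>> client.monitor_features(settings)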
"""
self.materialize_features(settings, execution_configurations, verbose)
# Get the feature keys given the name of a feature.
# Search both 'derived_feature_list' and 'anchor_list'.
# Return the related keys (a key_column list), or None if the feature cannot be found.
def _get_feature_key(self, feature_name: str):
features = []
if "derived_feature_list" in dir(self):
features += self.derived_feature_list
if "anchor_list" in dir(self):
for anchor in self.anchor_list:
features += anchor.features
for feature in features:
if feature.name == feature_name:
keys = feature.key
return set(key.key_column for key in keys)
self.logger.warning(
f"Invalid feature name: {feature_name}. Please call FeathrClient.build_features() first in order to materialize the features."
)
return None
# Validation of feature keys:
# features within one aggregation set, or features planned to be merged, should have the same keys.
# The param "allow_empty_key" indicates whether empty keys are acceptable.
def _valid_materialize_keys(self, features: List[str], allow_empty_key=False):
keys = None
for feature in features:
new_keys = self._get_feature_key(feature)
if new_keys is None:
self.logger.error(
f"Key of feature: {feature} is empty. Please confirm the feature is defined. In addition, if this feature is not from INPUT_CONTEXT, you might want to double check on the feature definition to see whether the key is empty or not."
)
return False
# If only get one key and it's "NOT_NEEDED", it means the feature has an empty key.
if ",".join(new_keys) == "NOT_NEEDED" and not allow_empty_key:
self.logger.error(f"Empty feature key is not allowed for features: {features}")
return False
if keys is None:
keys = copy.deepcopy(new_keys)
else:
if len(keys) != len(new_keys):
self.logger.error(f"Inconsistent feature keys. Current keys are {str(keys)}")
return False
for new_key in new_keys:
if new_key not in keys:
self.logger.error(f"Inconsistent feature keys. Current keys are {str(keys)}")
return False
return True
def materialize_features(
self,
settings: MaterializationSettings,
execution_configurations: Union[SparkExecutionConfiguration, Dict[str, str]] = {},
verbose: bool = False,
allow_materialize_non_agg_feature: bool = False,
):
"""Materialize feature data
Args:
settings: Feature materialization settings
execution_configurations: a dict that will be passed to spark job when the job starts up, i.e. the "spark configurations". Note that not all of the configuration will be honored since some of the configurations are managed by the Spark platform, such as Databricks or Azure Synapse. Refer to the [spark documentation](https://spark.apache.org/docs/latest/configuration.html) for a complete list of spark configurations.
allow_materialize_non_agg_feature: Materializing non-aggregated features (the features without WindowAggTransformation) doesn't output meaningful results so it's by default set to False, but if you really want to materialize non-aggregated features, set this to True.
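Example (a hedged sketch; MaterializationSettings is imported above, RedisSink and BackfillTime are assumed to be importable from the feathr package, and the table, feature names and backfill window are illustrative):
    >>> from datetime import datetime, timedelta
    >>> backfill_time = BackfillTime(start=datetime(2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))
    >>> settings = MaterializationSettings(
    ...     name="nycTaxiMaterializationJob",
    ...     sinks=[RedisSink(table_name="nycTaxiDemoFeature")],
    ...     feature_names=["f_location_avg_fare", "f_location_max_fare"],
    ...     backfill_time=backfill_time,
    ... )
    >>> client.materialize_features(settings)
    >>> client.wait_job_to_finish(timeout_sec=500)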
"""
feature_list = settings.feature_names
if len(feature_list) > 0:
if "anchor_list" in dir(self):
anchors = [anchor for anchor in self.anchor_list if isinstance(anchor.source, InputContext)]
anchor_feature_names = set(feature.name for anchor in anchors for feature in anchor.features)
for feature in feature_list:
if feature in anchor_feature_names:
raise RuntimeError(
f"Materializing features that are defined on INPUT_CONTEXT is not supported. {feature} is defined on INPUT_CONTEXT so you should remove it from the feature list in MaterializationSettings."
)
if not self._valid_materialize_keys(feature_list):
raise RuntimeError(
f"Invalid materialization features: {feature_list}, since they have different keys or they are not defined. Currently Feathr only supports materializing features of the same keys."
)
if not allow_materialize_non_agg_feature:
# Check if there are non-aggregation features in the list
for fn in feature_list:
# Check over anchor features
for anchor in self.anchor_list:
for feature in anchor.features:
if feature.name == fn and not isinstance(feature.transform, WindowAggTransformation):
raise RuntimeError(
f"Feature {fn} is not an aggregation feature. Currently Feathr only supports materializing aggregation features. If you want to materialize {fn}, please set allow_materialize_non_agg_feature to True."
)
# Check over derived features
for feature in self.derived_feature_list:
if feature.name == fn and not isinstance(feature.transform, WindowAggTransformation):
raise RuntimeError(
f"Feature {fn} is not an aggregation feature. Currently Feathr only supports materializing aggregation features. If you want to materialize {fn}, please set allow_materialize_non_agg_feature to True."
)
# Collect secrets from sinks. Also record output_path for later use if the sink is an offline sink (HdfsSink).
secrets = []
output_path = None
for sink in settings.sinks:
if hasattr(sink, "get_required_properties"):
secrets.extend(sink.get_required_properties())
if isinstance(sink, HdfsSink):
# Note: for now we only cache one output path from one of the HdfsSinks (if multiple sinks are passed).
output_path = sink.output_path
results = []
# produce materialization config
for end in settings.get_backfill_cutoff_time():
settings.backfill_time.end = end
config = _to_materialization_config(settings)
config_file_name = "feature_gen_conf/auto_gen_config_{}.conf".format(end.timestamp())
config_file_path = os.path.join(self.local_workspace_dir, config_file_name)
write_to_file(content=config, full_file_name=config_file_path)
# make sure `FeathrClient.build_features()` is called before getting offline features/materializing features in the Python SDK;
# otherwise users will be confused about which features are available.
# build_features() assigns the anchor_list and derived_feature_list attributes, so we check that both exist to make sure the condition above is met.
if "anchor_list" in dir(self) and "derived_feature_list" in dir(self):
self.config_helper.save_to_feature_config_from_context(
self.anchor_list, self.derived_feature_list, self.local_workspace_dir
)
else:
raise RuntimeError(
"Please call FeathrClient.build_features() first in order to materialize the features"
)
udf_files = _PreprocessingPyudfManager.prepare_pyspark_udf_files(
settings.feature_names, self.local_workspace_dir
)
# CLI will directly call this so the experience won't be broken
result = self._materialize_features_with_config(
feature_gen_conf_path=config_file_path,
execution_configurations=execution_configurations,
udf_files=udf_files,
secrets=secrets,
output_path=output_path,
)
if os.path.exists(config_file_path) and self.spark_runtime != "local":
os.remove(config_file_path)
results.append(result)
# Pretty print feature_names of materialized features
if verbose and settings:
FeaturePrinter.pretty_print_materialize_features(settings)
return results
def _materialize_features_with_config(
self,
feature_gen_conf_path: str = "feature_gen_conf/feature_gen.conf",
execution_configurations: Dict[str, str] = {},
udf_files: List = [],
secrets: List = [],
output_path: str = None,
):
"""Materializes feature data based on the feature generation config. The feature
data will be materialized to the destination specified in the feature generation config.
Args:
feature_gen_conf_path: Relative path to the feature generation config you want to materialize.
execution_configurations: Spark job execution configurations.
udf_files: UDF files.
secrets: Secrets to access sinks.
output_path: The output path of the materialized features when using an offline sink.
"""
cloud_udf_paths = [
self.feathr_spark_launcher.upload_or_get_cloud_path(udf_local_path) for udf_local_path in udf_files
]
# Read all features conf
generation_config = FeatureGenerationJobParams(
generation_config_path=os.path.abspath(feature_gen_conf_path),
feature_config=os.path.join(self.local_workspace_dir, "feature_conf/"),
)
# When using offline sink (i.e. output_path is not None)
job_tags = {}
if output_path:
job_tags[OUTPUT_PATH_TAG] = output_path
# set output format in job tags if it's set by user, so that it can be used to parse the job result in the helper function
if execution_configurations is not None and OUTPUT_FORMAT in execution_configurations:
job_tags[OUTPUT_FORMAT] = execution_configurations[OUTPUT_FORMAT]
else:
job_tags[OUTPUT_FORMAT] = "avro"
"""
- Job tags are for job metadata and it's not passed to the actual spark job (i.e. not visible to spark job), more like a platform related thing that Feathr want to add (currently job tags only have job output URL and job output format, ). They are carried over with the job and is visible to every Feathr client. Think this more like some customized metadata for the job which would be weird to be put in the spark job itself.
- Job arguments (or sometimes called job parameters)are the arguments which are command line arguments passed into the actual spark job. This is usually highly related with the spark job. In Feathr it's like the input to the scala spark CLI. They are usually not spark specific (for example if we want to specify the location of the feature files, or want to
- Job configuration are like "configurations" for the spark job and are usually spark specific. For example, we want to control the no. of write parts for spark
Job configurations and job arguments (or sometimes called job parameters) have quite some overlaps (i.e. you can achieve the same goal by either using the job arguments/parameters vs. job configurations). But the job tags should just be used for metadata purpose.
"""
optional_params = []
if self.env_config.get_from_env_or_akv("KAFKA_SASL_JAAS_CONFIG"):
optional_params = optional_params + ["--kafka-config", self._get_kafka_config_str()]
arguments = (
[
"--generation-config",
self.feathr_spark_launcher.upload_or_get_cloud_path(generation_config.generation_config_path),
# Local config, comma-separated file names
"--feature-config",
self.feathr_spark_launcher.upload_or_get_cloud_path(generation_config.feature_config),
"--redis-config",
self._getRedisConfigStr(),
]
+ self._get_offline_storage_arguments()
+ optional_params
)
monitoring_config_str = self._get_monitoring_config_str()
if monitoring_config_str:
arguments.append("--monitoring-config")
arguments.append(monitoring_config_str)
return self.feathr_spark_launcher.submit_feathr_job(
job_name=self.project_name + "_feathr_feature_materialization_job",
main_jar_path=self._FEATHR_JOB_JAR_PATH,
python_files=cloud_udf_paths,
job_tags=job_tags,
main_class_name=GEN_CLASS_NAME,
arguments=arguments,
reference_files_path=[],
configuration=execution_configurations,
properties=self._collect_secrets(secrets),
)
def wait_job_to_finish(self, timeout_sec: int = 300):
"""Waits for the job to finish in a blocking way unless it times out"""
if self.feathr_spark_launcher.wait_for_completion(timeout_sec):
return
else:
raise RuntimeError("Spark job failed.")
def _getRedisConfigStr(self):
"""Construct the Redis config string. The host, port, credential and other parameters can be set via environment
variables."""
password = self.env_config.get_from_env_or_akv(REDIS_PASSWORD)
host = self.redis_host
port = self.redis_port
ssl_enabled = self.redis_ssl_enabled
config_str = """
REDIS_PASSWORD: "{REDIS_PASSWORD}"
REDIS_HOST: "{REDIS_HOST}"
REDIS_PORT: {REDIS_PORT}
REDIS_SSL_ENABLED: {REDIS_SSL_ENABLED}
""".format(
REDIS_PASSWORD=password, REDIS_HOST=host, REDIS_PORT=port, REDIS_SSL_ENABLED=str(ssl_enabled)
)
return self._reshape_config_str(config_str)
def _get_s3_config_str(self):
"""Construct the S3 config string. The endpoint, access key, secret key, and other parameters can be set via
environment variables."""
endpoint = self.s3_endpoint
# If the S3 endpoint is set in feathr_config, the access keys still have to be supplied separately;
# keys can only be read from environment variables or Azure Key Vault, not the config file.
access_key = self.env_config.get_from_env_or_akv("S3_ACCESS_KEY")
secret_key = self.env_config.get_from_env_or_akv("S3_SECRET_KEY")
# HOCON format will be parsed by the Feathr job
config_str = """
S3_ENDPOINT: {S3_ENDPOINT}
S3_ACCESS_KEY: "{S3_ACCESS_KEY}"
S3_SECRET_KEY: "{S3_SECRET_KEY}"
""".format(
S3_ENDPOINT=endpoint, S3_ACCESS_KEY=access_key, S3_SECRET_KEY=secret_key
)
return self._reshape_config_str(config_str)
def _get_adls_config_str(self):
"""Construct the ADLS config string for abfs(s). The Account, access key and other parameters can be set via
environment variables."""
account = self.env_config.get_from_env_or_akv("ADLS_ACCOUNT")
# if ADLS Account is set in the feathr_config, then we need other environment variables