refactor!: move model optional args to kwargs (#381)

GarrettWu · web-flow · commit 4037992b61ff · 2024-02-21T20:42:15.000Z
To be more like sklearn and make the API more accurate. These optional params shouldn't be passed positionally.
diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py
@@ -127,6 +127,10 @@ def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T:
         self._bqml_model.register(vertex_ai_model_id)
         return self
 
+    @abc.abstractmethod
+    def to_gbq(self, model_name, replace):
+        pass
+
 
 class TrainablePredictor(Predictor):
     """A BigQuery DataFrames ML Model base class that can be used to fit and predict outputs.
@@ -141,11 +145,6 @@ def _fit(self, X, y, transforms=None):
     def score(self, X, y):
         pass
 
-    # TODO(b/291812029): move to Predictor after implement in LLM and imported models
-    @abc.abstractmethod
-    def to_gbq(self, model_name, replace):
-        pass
-
 
 class SupervisedTrainablePredictor(TrainablePredictor):
     """A BigQuery DataFrames ML Supervised Model base class that can be used to fit and predict outputs.
@@ -165,7 +164,7 @@ def fit(
 class UnsupervisedTrainablePredictor(TrainablePredictor):
     """A BigQuery DataFrames ML Unsupervised Model base class that can be used to fit and predict outputs.
 
-    Only need to provide both X (y is optional and ignored) in unsupervised tasks."""
+    Only need to provide X (y is optional and ignored) in unsupervised tasks."""
 
     _T = TypeVar("_T", bound="UnsupervisedTrainablePredictor")
 
diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py
@@ -58,6 +58,7 @@ class XGBRegressor(
     def __init__(
         self,
         num_parallel_tree: int = 1,
+        *,
         booster: Literal["gbtree", "dart"] = "gbtree",
         dart_normalized_type: Literal["tree", "forest"] = "tree",
         tree_method: Literal["auto", "exact", "approx", "hist"] = "auto",
@@ -215,6 +216,7 @@ class XGBClassifier(
     def __init__(
         self,
         num_parallel_tree: int = 1,
+        *,
         booster: Literal["gbtree", "dart"] = "gbtree",
         dart_normalized_type: Literal["tree", "forest"] = "tree",
         tree_method: Literal["auto", "exact", "approx", "hist"] = "auto",
@@ -372,6 +374,7 @@ class RandomForestRegressor(
     def __init__(
         self,
         num_parallel_tree: int = 100,
+        *,
         tree_method: Literal["auto", "exact", "approx", "hist"] = "auto",
         min_tree_child_weight: int = 1,
         colsample_bytree=1.0,
@@ -538,6 +541,7 @@ class RandomForestClassifier(
     def __init__(
         self,
         num_parallel_tree: int = 100,
+        *,
         tree_method: Literal["auto", "exact", "approx", "hist"] = "auto",
         min_tree_child_weight: int = 1,
         colsample_bytree: float = 1.0,
diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py
@@ -87,7 +87,7 @@ def _fit(
         )
 
     def predict(
-        self, X=None, horizon: int = 3, confidence_level: float = 0.95
+        self, X=None, *, horizon: int = 3, confidence_level: float = 0.95
     ) -> bpd.DataFrame:
         """Predict the closest cluster for each sample in X.
 
diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py
@@ -32,15 +32,17 @@ class TensorFlowModel(base.Predictor):
     """Imported TensorFlow model.
 
     Args:
+        model_path (str):
+            GCS path that holds the model files.
         session (BigQuery Session):
             BQ session to create the model
-        model_path (str):
-            GCS path that holds the model files."""
+    """
 
     def __init__(
         self,
+        model_path: str,
+        *,
         session: Optional[bigframes.Session] = None,
-        model_path: Optional[str] = None,
     ):
         self.session = session or bpd.get_global_session()
         self.model_path = model_path
@@ -59,7 +61,7 @@ def _from_bq(
     ) -> TensorFlowModel:
         assert model.model_type == "TENSORFLOW"
 
-        tf_model = cls(session=session, model_path=None)
+        tf_model = cls(session=session, model_path="")
         tf_model._bqml_model = core.BqmlModel(session, model)
         return tf_model
 
@@ -109,15 +111,17 @@ class ONNXModel(base.Predictor):
     """Imported Open Neural Network Exchange (ONNX) model.
 
     Args:
+        model_path (str):
+            Cloud Storage path that holds the model files.
         session (BigQuery Session):
             BQ session to create the model
-        model_path (str):
-            Cloud Storage path that holds the model files."""
+    """
 
     def __init__(
         self,
+        model_path: str,
+        *,
         session: Optional[bigframes.Session] = None,
-        model_path: Optional[str] = None,
     ):
         self.session = session or bpd.get_global_session()
         self.model_path = model_path
@@ -134,7 +138,7 @@ def _create_bqml_model(self):
     def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ONNXModel:
         assert model.model_type == "ONNX"
 
-        onnx_model = cls(session=session, model_path=None)
+        onnx_model = cls(session=session, model_path="")
         onnx_model._bqml_model = core.BqmlModel(session, model)
         return onnx_model
 
@@ -189,8 +193,8 @@ class XGBoostModel(base.Predictor):
         https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-xgboost#limitations
 
     Args:
-        session (BigQuery Session):
-            BQ session to create the model
+        model_path (str):
+            Cloud Storage path that holds the model files.
         input (Dict, default None):
             Specify the model input schema information when you
             create the XGBoost model. The input should be the format of
@@ -203,15 +207,17 @@ class XGBoostModel(base.Predictor):
             {field_name: field_type}. Output is optional only if feature_names
             and feature_types are both specified in the model file. Supported types
             are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>".
-        model_path (str):
-            Cloud Storage path that holds the model files."""
+        session (BigQuery Session):
+            BQ session to create the model
+    """
 
     def __init__(
         self,
-        session: Optional[bigframes.Session] = None,
+        model_path: str,
+        *,
         input: Mapping[str, str] = {},
         output: Mapping[str, str] = {},
-        model_path: Optional[str] = None,
+        session: Optional[bigframes.Session] = None,
     ):
         self.session = session or bpd.get_global_session()
         self.model_path = model_path
@@ -248,7 +254,7 @@ def _from_bq(
     ) -> XGBoostModel:
         assert model.model_type == "XGBOOST"
 
-        xgboost_model = cls(session=session, model_path=None)
+        xgboost_model = cls(session=session, model_path="")
         xgboost_model._bqml_model = core.BqmlModel(session, model)
         return xgboost_model
 
diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py
@@ -58,6 +58,7 @@ class LinearRegression(
 
     def __init__(
         self,
+        *,
         optimize_strategy: Literal[
             "auto_strategy", "batch_gradient_descent", "normal_equation"
         ] = "normal_equation",
@@ -192,6 +193,7 @@ class LogisticRegression(
     # TODO(ashleyxu) support class_weights in the constructor.
     def __init__(
         self,
+        *,
         fit_intercept: bool = True,
         class_weights: Optional[Union[Literal["balanced"], Dict[str, float]]] = None,
     ):
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
@@ -66,6 +66,7 @@ class PaLM2TextGenerator(base.Predictor):
 
     def __init__(
         self,
+        *,
         model_name: Literal["text-bison", "text-bison-32k"] = "text-bison",
         session: Optional[bigframes.Session] = None,
         connection_name: Optional[str] = None,
@@ -140,6 +141,7 @@ def _from_bq(
     def predict(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
+        *,
         temperature: float = 0.0,
         max_output_tokens: int = 128,
         top_k: int = 40,
@@ -273,6 +275,7 @@ class PaLM2TextEmbeddingGenerator(base.Predictor):
 
     def __init__(
         self,
+        *,
         model_name: Literal[
             "textembedding-gecko", "textembedding-gecko-multilingual"
         ] = "textembedding-gecko",
@@ -415,6 +418,7 @@ class GeminiTextGenerator(base.Predictor):
 
     def __init__(
         self,
+        *,
         session: Optional[bigframes.Session] = None,
         connection_name: Optional[str] = None,
     ):
@@ -475,6 +479,7 @@ def _from_bq(
     def predict(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
+        *,
         temperature: float = 0.9,
         max_output_tokens: int = 8192,
         top_k: int = 40,
diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py
@@ -34,6 +34,7 @@
 def r2_score(
     y_true: Union[bpd.DataFrame, bpd.Series],
     y_pred: Union[bpd.DataFrame, bpd.Series],
+    *,
     force_finite=True,
 ) -> float:
     y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred)
@@ -61,6 +62,7 @@ def r2_score(
 def accuracy_score(
     y_true: Union[bpd.DataFrame, bpd.Series],
     y_pred: Union[bpd.DataFrame, bpd.Series],
+    *,
     normalize=True,
 ) -> float:
     # TODO(ashleyxu): support sample_weight as the parameter
@@ -83,6 +85,7 @@ def accuracy_score(
 def roc_curve(
     y_true: Union[bpd.DataFrame, bpd.Series],
     y_score: Union[bpd.DataFrame, bpd.Series],
+    *,
     drop_intermediate: bool = True,
 ) -> Tuple[bpd.Series, bpd.Series, bpd.Series]:
     # TODO(bmil): Add multi-class support
@@ -227,6 +230,7 @@ def confusion_matrix(
 def recall_score(
     y_true: Union[bpd.DataFrame, bpd.Series],
     y_pred: Union[bpd.DataFrame, bpd.Series],
+    *,
     average: str = "binary",
 ) -> pd.Series:
     # TODO(ashleyxu): support more average type, default to "binary"
@@ -263,6 +267,7 @@ def recall_score(
 def precision_score(
     y_true: Union[bpd.DataFrame, bpd.Series],
     y_pred: Union[bpd.DataFrame, bpd.Series],
+    *,
     average: str = "binary",
 ) -> pd.Series:
     # TODO(ashleyxu): support more average type, default to "binary"
@@ -301,6 +306,7 @@ def precision_score(
 def f1_score(
     y_true: Union[bpd.DataFrame, bpd.Series],
     y_pred: Union[bpd.DataFrame, bpd.Series],
+    *,
     average: str = "binary",
 ) -> pd.Series:
     # TODO(ashleyxu): support more average type, default to "binary"
diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py
@@ -54,6 +54,7 @@ def __init__(
         endpoint: str,
         input: Mapping[str, str],
         output: Mapping[str, str],
+        *,
         session: Optional[bigframes.Session] = None,
         connection_name: Optional[str] = None,
     ):

Original file line number	Diff line number	Diff line change
`@@ -87,7 +87,7 @@ def _fit(`
`87`	`87`	`)`
`88`	`88`
`89`	`89`	`def predict(`
`90`		`- self, X=None, horizon: int = 3, confidence_level: float = 0.95`
	`90`	`+ self, X=None, *, horizon: int = 3, confidence_level: float = 0.95`
`91`	`91`	`) -> bpd.DataFrame:`
`92`	`92`	`"""Predict the closest cluster for each sample in X.`
`93`	`93`