# RQ-KMeans Semantic-ID (SID) generation (FAISS residual k-means).
#
# Example config wired to the small Parquet sample under
# data/sid_example/item_only (see train_input_path / eval_input_path below).
# NOTE(review): this header previously referenced data/sid_example_v2/item_only
# while the configured paths use data/sid_example/item_only -- confirm which
# directory is current and update the paths if the sample was moved.
#
# Production scale (full data) uses batch_size: 2048 and codebook: 8192 x3.
#
# NOTE: SidRqkmeans is CPU-only and single-process
#       -> launch with --nproc-per-node=1.

# Input data and output location. Train and eval read the same sample files.
train_input_path: "data/sid_example/item_only/*.parquet"
eval_input_path: "data/sid_example/item_only/*.parquet"
model_dir: "experiments/sid_rqkmeans"

train_config {
  sparse_optimizer {
    adagrad_optimizer {
      lr: 0.001
    }
    constant_learning_rate {
    }
  }
  dense_optimizer {
    adam_optimizer {
      lr: 0.00002
    }
    constant_learning_rate {
    }
  }
  # Single pass over the input.
  num_epochs: 1
  save_checkpoints_steps: 0
  save_checkpoints_epochs: 0
  log_step_count_steps: 100
}

data_config {
  # Example-scale batch size; the header above lists 2048 for production.
  batch_size: 512
  dataset_type: ParquetDataset
  fg_mode: FG_DAG
  num_workers: 4
}

# Single raw feature "emb", read from the item-side "embedding" field.
feature_configs {
  raw_feature {
    feature_name: "emb"
    expression: "item:embedding"
    value_dim: 512
  }
}

model_config {
  feature_groups {
    group_name: "deep"
    feature_names: "emb"
    group_type: DEEP
  }
  sid_rqkmeans {
    # Three codebook entries of size 256 each for the example
    # (the header above lists 8192 x3 for production).
    codebook: 256
    codebook: 256
    codebook: 256
    normalize_residuals: true
    # Passed through as FAISS k-means options (per the field name);
    # a fixed seed is set here.
    faiss_kmeans_kwargs {
      niter: 20
      seed: 42
      verbose: true
      spherical: false
    }
  }
}