mirror of
https://github.com/riwiwa/muzi.git
synced 2026-02-28 11:56:57 -08:00
360 lines
8.8 KiB
Go
360 lines
8.8 KiB
Go
package migrate
|
|
|
|
// Spotify import functionality for migrating Spotify listening history
|
|
// from JSON export files into the database
|
|
|
|
// This file handles:
|
|
// - Parsing Spotify JSON track data
|
|
// - Batch processing with deduplication (20-second window)
|
|
// - Efficient bulk inserts using pgx.CopyFrom
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
|
|
"muzi/db"
|
|
|
|
"github.com/jackc/pgx/v5"
|
|
)
|
|
|
|
// Tuning values for the Spotify import.
const (
	// batchSize is the number of exported tracks handled per import batch.
	batchSize = 1000

	// minPlayTime is the smallest ms_played value a track may have and still
	// be imported (20000 ms = 20 sec).
	minPlayTime = 20_000

	// timeDiff is the window within which two plays of the same song by the
	// same artist are treated as one.
	timeDiff = 20 * time.Second
)
|
|
|
|
// SpotifyTrack is a single listening event in Spotify's JSON export format.
type SpotifyTrack struct {
	// Timestamp is the export's "ts" value.
	Timestamp time.Time `json:"ts"`
	// Played is the export's "ms_played" value, in milliseconds.
	Played int `json:"ms_played"`
	// Name is the track title.
	Name string `json:"master_metadata_track_name"`
	// Artist is the album artist name.
	Artist string `json:"master_metadata_album_artist_name"`
	// Album is the album title.
	Album string `json:"master_metadata_album_album_name"`
}
|
|
|
|
// Implements pgx.CopyFromSource for efficient bulk inserts.
|
|
// Filters out duplicates in-memory before sending to PostgreSQL
|
|
type trackSource struct {
|
|
tracks []SpotifyTrack // Full batch of tracks to process
|
|
tracksToSkip map[string]struct{} // Set of duplicate keys to skip
|
|
idx int // Current position in tracks slice
|
|
userId int // User ID to associate with imported tracks
|
|
}
|
|
|
|
// dbTrack is a play that is already stored in the database. Only the columns
// needed for duplicate detection during import are kept.
type dbTrack struct {
	// Timestamp is the stored play's timestamp.
	Timestamp time.Time
	// SongName is the stored song name.
	SongName string
	// Artist is the stored artist name.
	Artist string
}
|
|
|
|
// Import Spotify listening history into the database.
|
|
// Processes tracks in batches of 1000 (default), filters out tracks played <
|
|
// 20 seconds, deduplicates against existing data, and sends progress updates
|
|
// via progressChan.
|
|
// The progressChan must not be closed by the caller. The receiver should
|
|
// stop reading when Status is "completed". This avoids panics from
|
|
// sending on a closed channel.
|
|
|
|
func ImportSpotify(tracks []SpotifyTrack,
|
|
userId int, progressChan chan ProgressUpdate,
|
|
) {
|
|
totalImported := 0
|
|
totalTracks := len(tracks)
|
|
batchStart := 0
|
|
totalBatches := (totalTracks + batchSize - 1) / batchSize
|
|
|
|
// Send initial progress update
|
|
sendProgressUpdate(progressChan, 0, 0, totalBatches, totalImported, "running")
|
|
|
|
for batchStart < totalTracks {
|
|
// Cap batchEnd at total track count on final batch to prevent OOB error
|
|
batchEnd := min(batchStart+batchSize, totalTracks)
|
|
currentBatch := (batchStart / batchSize) + 1
|
|
|
|
var validTracks []SpotifyTrack
|
|
for i := batchStart; i < batchEnd; i++ {
|
|
if tracks[i].Played >= minPlayTime &&
|
|
tracks[i].Name != "" &&
|
|
tracks[i].Artist != "" {
|
|
validTracks = append(validTracks, tracks[i])
|
|
}
|
|
}
|
|
|
|
if len(validTracks) == 0 {
|
|
batchStart += batchSize
|
|
// Send progress update even for empty batches
|
|
sendProgressUpdate(
|
|
progressChan,
|
|
currentBatch,
|
|
currentBatch,
|
|
totalBatches,
|
|
totalImported,
|
|
"running",
|
|
)
|
|
continue
|
|
}
|
|
|
|
tracksToSkip, err := getDupes(userId, validTracks)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "Error checking existing tracks: %v\n", err)
|
|
batchStart += batchSize
|
|
continue
|
|
}
|
|
|
|
src := &trackSource{
|
|
tracks: validTracks,
|
|
tracksToSkip: tracksToSkip,
|
|
idx: 0,
|
|
userId: userId,
|
|
}
|
|
|
|
copyCount, err := db.Pool.CopyFrom(
|
|
context.Background(),
|
|
pgx.Identifier{"history"},
|
|
[]string{
|
|
"user_id",
|
|
"timestamp",
|
|
"song_name",
|
|
"artist",
|
|
"album_name",
|
|
"ms_played",
|
|
"platform",
|
|
},
|
|
src,
|
|
)
|
|
// Do not log errors that come from adding duplicate songs
|
|
if err != nil {
|
|
if !strings.Contains(err.Error(), "duplicate") {
|
|
fmt.Fprintf(os.Stderr, "Spotify batch insert failed: %v\n", err)
|
|
}
|
|
} else {
|
|
totalImported += int(copyCount)
|
|
}
|
|
|
|
sendProgressUpdate(
|
|
progressChan,
|
|
currentBatch,
|
|
currentBatch,
|
|
totalBatches,
|
|
totalImported,
|
|
"running",
|
|
)
|
|
|
|
batchStart += batchSize
|
|
}
|
|
|
|
sendProgressUpdate(
|
|
progressChan,
|
|
totalBatches,
|
|
totalBatches,
|
|
totalBatches,
|
|
totalImported,
|
|
"completed",
|
|
)
|
|
}
|
|
|
|
// Sends a progress update to the channel if it's not nil.
|
|
// To avoid panics from sending on a closed channel, the channel
|
|
// must never be closed by the receiver. The receiver should stop reading when
|
|
// Status reads "completed".
|
|
func sendProgressUpdate(
|
|
ch chan ProgressUpdate,
|
|
current, completed, total, imported int,
|
|
status string,
|
|
) {
|
|
if ch != nil {
|
|
ch <- ProgressUpdate{
|
|
CurrentPage: current,
|
|
CompletedPages: completed,
|
|
TotalPages: total,
|
|
TracksImported: imported,
|
|
Status: status,
|
|
}
|
|
}
|
|
}
|
|
|
|
// Finds tracks that already exist in the database or are duplicates within the
|
|
// current batch, using a 20-second window to handle minor timestamp variations
|
|
func getDupes(userId int, tracks []SpotifyTrack) (map[string]struct{}, error) {
|
|
minTs, maxTs := findTimeRange(tracks)
|
|
if minTs.IsZero() {
|
|
return map[string]struct{}{}, nil
|
|
}
|
|
|
|
dbTracks, err := fetchDbTracks(userId, minTs, maxTs)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
dbIndex := buildDbTrackIndex(dbTracks)
|
|
duplicates := make(map[string]struct{})
|
|
seenInBatch := make(map[string]struct{})
|
|
|
|
for _, track := range tracks {
|
|
trackKey := createTrackKey(track)
|
|
|
|
// Check in batch
|
|
if _, seen := seenInBatch[trackKey]; seen {
|
|
duplicates[trackKey] = struct{}{}
|
|
continue
|
|
}
|
|
seenInBatch[trackKey] = struct{}{}
|
|
|
|
// Check in DB
|
|
lookupKey := fmt.Sprintf("%s|%s", track.Artist, track.Name)
|
|
if dbTimestamps, found := dbIndex[lookupKey]; found {
|
|
if isDuplicateWithinWindow(track, dbTimestamps) {
|
|
duplicates[trackKey] = struct{}{}
|
|
}
|
|
}
|
|
}
|
|
|
|
return duplicates, nil
|
|
}
|
|
|
|
// Get the min/max timestamp range for a batch of tracks
|
|
func findTimeRange(tracks []SpotifyTrack) (time.Time, time.Time) {
|
|
var minTs, maxTs time.Time
|
|
for _, t := range tracks {
|
|
if minTs.IsZero() || t.Timestamp.Before(minTs) {
|
|
minTs = t.Timestamp
|
|
}
|
|
if t.Timestamp.After(maxTs) {
|
|
maxTs = t.Timestamp
|
|
}
|
|
}
|
|
return minTs, maxTs
|
|
}
|
|
|
|
// Get all tracks in the database for a user that have the same timestamp
|
|
// range as the current batch
|
|
func fetchDbTracks(userId int, minTs, maxTs time.Time) ([]dbTrack, error) {
|
|
rows, err := db.Pool.Query(context.Background(),
|
|
`SELECT song_name, artist, timestamp
|
|
FROM history
|
|
WHERE user_id = $1
|
|
AND timestamp BETWEEN $2 AND $3`,
|
|
userId,
|
|
// Adjust 20 seconds to find duplicates on edges of batch
|
|
minTs.Add(-timeDiff),
|
|
maxTs.Add(timeDiff))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
var dbTracks []dbTrack
|
|
for rows.Next() {
|
|
var t dbTrack
|
|
if err := rows.Scan(&t.SongName, &t.Artist, &t.Timestamp); err != nil {
|
|
continue
|
|
}
|
|
dbTracks = append(dbTracks, t)
|
|
}
|
|
err = rows.Err()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return dbTracks, nil
|
|
}
|
|
|
|
// Create a lookup map from Artist|Name to timestamps for efficient duplicate
|
|
// detection.
|
|
func buildDbTrackIndex(tracks []dbTrack) map[string][]time.Time {
|
|
index := make(map[string][]time.Time)
|
|
for _, t := range tracks {
|
|
key := t.Artist + "|" + t.SongName
|
|
index[key] = append(index[key], t.Timestamp)
|
|
}
|
|
return index
|
|
}
|
|
|
|
// Generate a unique identifier for a track using artist, name, and
|
|
// normalized timestamp.
|
|
func createTrackKey(track SpotifyTrack) string {
|
|
ts := track.Timestamp.Format(time.RFC3339Nano)
|
|
return fmt.Sprintf("%s|%s|%s", track.Artist, track.Name, ts)
|
|
}
|
|
|
|
// Check if a track timestamp falls < 20 seconds of another
|
|
func isDuplicateWithinWindow(track SpotifyTrack,
|
|
existingTimestamps []time.Time,
|
|
) bool {
|
|
for _, existingTime := range existingTimestamps {
|
|
diff := track.Timestamp.Sub(existingTime)
|
|
if diff < 0 {
|
|
diff = -diff
|
|
}
|
|
if diff < timeDiff {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Advances to the next valid track, skipping duplicates and invalid timestamps.
|
|
// Returns false when all tracks have been processed
|
|
func (s *trackSource) Next() bool {
|
|
for s.idx < len(s.tracks) {
|
|
t := s.tracks[s.idx]
|
|
key := createTrackKey(t)
|
|
if _, shouldSkip := s.tracksToSkip[key]; shouldSkip {
|
|
s.idx++
|
|
continue
|
|
}
|
|
s.idx++
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Returns the current track's data formatted for database insertion.
|
|
// Must only be called after Next() returns true
|
|
func (s *trackSource) Values() ([]any, error) {
|
|
// idx is already incremented in Next(), so use idx-1
|
|
t := s.tracks[s.idx-1]
|
|
return []any{
|
|
s.userId,
|
|
t.Timestamp,
|
|
t.Name,
|
|
t.Artist,
|
|
t.Album,
|
|
t.Played,
|
|
"spotify",
|
|
}, nil
|
|
}
|
|
|
|
// Returns any error encountered during iteration.
|
|
// Currently always returns nil as errors are logged in Next()
|
|
func (s *trackSource) Err() error {
|
|
return nil
|
|
}
|
|
|
|
// Implements custom JSON unmarshaling to parse the timestamp
|
|
func (s *SpotifyTrack) UnmarshalJSON(data []byte) error {
|
|
type Alias SpotifyTrack
|
|
aux := &struct {
|
|
Timestamp string `json:"ts"`
|
|
*Alias
|
|
}{
|
|
Alias: (*Alias)(s),
|
|
}
|
|
|
|
if err := json.Unmarshal(data, &aux); err != nil {
|
|
return err
|
|
}
|
|
|
|
ts, err := time.Parse(time.RFC3339Nano, aux.Timestamp)
|
|
if err != nil {
|
|
return fmt.Errorf("parsing timestamp: %w", err)
|
|
}
|
|
s.Timestamp = ts
|
|
return nil
|
|
}
|