mirror of
https://github.com/riwiwa/muzi.git
synced 2026-02-28 11:56:57 -08:00
clean up backend spotify import functionality
This commit is contained in:
@@ -12,6 +12,8 @@ import (
|
|||||||
"github.com/jackc/pgx/v5"
|
"github.com/jackc/pgx/v5"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// batchSize is the maximum number of tracks ImportSpotify handles per batch.
const batchSize = 1000
|
||||||
|
|
||||||
type SpotifyTrack struct {
|
type SpotifyTrack struct {
|
||||||
Timestamp string `json:"ts"`
|
Timestamp string `json:"ts"`
|
||||||
Played int `json:"ms_played"`
|
Played int `json:"ms_played"`
|
||||||
@@ -20,25 +22,30 @@ type SpotifyTrack struct {
|
|||||||
Album string `json:"master_metadata_album_album_name"`
|
Album string `json:"master_metadata_album_album_name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type existingTrack struct {
|
type trackSource struct {
|
||||||
|
tracks []SpotifyTrack
|
||||||
|
tracksToSkip map[string]struct{}
|
||||||
|
idx int
|
||||||
|
userId int
|
||||||
|
}
|
||||||
|
|
||||||
|
// dbTrack is one play read back from the history table: the song_name,
// artist and timestamp columns selected by fetchDbTracks.
type dbTrack struct {
	Timestamp time.Time
	SongName  string
	Artist    string
}
|
||||||
|
|
||||||
// trackSource implements pgx.CopyFromSource for efficient bulk inserts
|
|
||||||
type trackSource struct {
|
|
||||||
tracks []SpotifyTrack
|
|
||||||
existing map[string]struct{}
|
|
||||||
idx int
|
|
||||||
userId int
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *trackSource) Next() bool {
|
func (s *trackSource) Next() bool {
|
||||||
for s.idx < len(s.tracks) {
|
for s.idx < len(s.tracks) {
|
||||||
t := s.tracks[s.idx]
|
t := s.tracks[s.idx]
|
||||||
key := fmt.Sprintf("%s|%s|%s", t.Artist, t.Name, t.Timestamp)
|
ts, err := normalizeTs(t.Timestamp)
|
||||||
if _, exists := s.existing[key]; exists {
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Error normalizing timestamp: %v\n", err)
|
||||||
|
s.idx++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := fmt.Sprintf("%s|%s|%s", t.Artist, t.Name, ts)
|
||||||
|
if _, shouldSkip := s.tracksToSkip[key]; shouldSkip {
|
||||||
s.idx++
|
s.idx++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -51,7 +58,6 @@ func (s *trackSource) Next() bool {
|
|||||||
func (s *trackSource) Values() ([]any, error) {
|
func (s *trackSource) Values() ([]any, error) {
|
||||||
// idx is already incremented in Next(), so use idx-1
|
// idx is already incremented in Next(), so use idx-1
|
||||||
t := s.tracks[s.idx-1]
|
t := s.tracks[s.idx-1]
|
||||||
// parse spotify string timestamp to a real time object
|
|
||||||
ts, err := time.Parse(time.RFC3339Nano, t.Timestamp)
|
ts, err := time.Parse(time.RFC3339Nano, t.Timestamp)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -71,48 +77,62 @@ func (s *trackSource) Err() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// normalizeTs parses an RFC 3339 timestamp and re-serialises it in one
// canonical form, so that equal instants always produce the same string when
// used as part of a track key.
//
// The instant is converted to UTC before formatting: formatting the parsed
// value directly would keep the input's zone offset, so "01:00:00+01:00" and
// "00:00:00Z" would yield different keys for the same play. Trailing zeros in
// the fractional seconds are dropped by the RFC3339Nano layout.
//
// It returns an empty string and the parse error when ts is not a valid
// RFC 3339 timestamp.
func normalizeTs(ts string) (string, error) {
	t, err := time.Parse(time.RFC3339Nano, ts)
	if err != nil {
		return "", err
	}
	return t.UTC().Format(time.RFC3339Nano), nil
}
|
||||||
|
|
||||||
// check if all parses failed, therefore no way to find duplicate by time
|
func getExistingTracks(
|
||||||
|
userId int, tracks []SpotifyTrack,
|
||||||
|
) (map[string]struct{}, error) {
|
||||||
|
minTs, maxTs := findTimeRange(tracks)
|
||||||
if minTs.IsZero() {
|
if minTs.IsZero() {
|
||||||
return map[string]struct{}{}, nil
|
return map[string]struct{}{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// find all tracks within [min-20s, max+20s] window (duplicates)
|
dbTracks, err := fetchDbTracks(userId, minTs, maxTs)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
dbIndex := buildDbTrackIndex(dbTracks)
|
||||||
|
|
||||||
|
return findDuplicates(tracks, dbIndex), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the min/max timestamp range for a batch of tracks
|
||||||
|
func findTimeRange(tracks []SpotifyTrack) (time.Time, time.Time) {
|
||||||
|
var minTs, maxTs time.Time
|
||||||
|
for _, t := range tracks {
|
||||||
|
ts, err := time.Parse(time.RFC3339Nano, t.Timestamp)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if minTs.IsZero() || ts.Before(minTs) {
|
||||||
|
minTs = ts
|
||||||
|
}
|
||||||
|
if ts.After(maxTs) {
|
||||||
|
maxTs = ts
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return minTs, maxTs
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
get all tracks in the database for a user that have the same timestamp
|
||||||
|
range as the current batch
|
||||||
|
*/
|
||||||
|
func fetchDbTracks(userId int, minTs, maxTs time.Time) ([]dbTrack, error) {
|
||||||
rows, err := db.Pool.Query(context.Background(),
|
rows, err := db.Pool.Query(context.Background(),
|
||||||
`SELECT song_name, artist, timestamp
|
`SELECT song_name, artist, timestamp
|
||||||
FROM history
|
FROM history
|
||||||
WHERE user_id = $1
|
WHERE user_id = $1
|
||||||
AND timestamp BETWEEN $2 AND $3`,
|
AND timestamp BETWEEN $2 AND $3`,
|
||||||
userId,
|
userId,
|
||||||
|
// adjust 20 seconds to find duplicates on edges of batch
|
||||||
minTs.Add(-20*time.Second),
|
minTs.Add(-20*time.Second),
|
||||||
maxTs.Add(20*time.Second))
|
maxTs.Add(20*time.Second))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -120,102 +140,118 @@ func getExistingTracks(
|
|||||||
}
|
}
|
||||||
defer rows.Close()
|
defer rows.Close()
|
||||||
|
|
||||||
// prepare map to hold duplicate track keys
|
var dbTracks []dbTrack
|
||||||
existing := make(map[string]struct{})
|
|
||||||
// create array of tracks
|
|
||||||
var existingTracks []existingTrack
|
|
||||||
// for each repeat play (-20s +20s)
|
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
// write the data from json to the track in memory
|
var t dbTrack
|
||||||
var t existingTrack
|
|
||||||
if err := rows.Scan(&t.SongName, &t.Artist, &t.Timestamp); err != nil {
|
if err := rows.Scan(&t.SongName, &t.Artist, &t.Timestamp); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// add track in memory to existingTracks array
|
dbTracks = append(dbTracks, t)
|
||||||
existingTracks = append(existingTracks, t)
|
|
||||||
}
|
}
|
||||||
|
return dbTracks, nil
|
||||||
|
}
|
||||||
|
|
||||||
// index existing tracks by artist|name for O(1) lookup
|
func buildDbTrackIndex(tracks []dbTrack) map[string][]time.Time {
|
||||||
existingIndex := make(map[string][]time.Time)
|
index := make(map[string][]time.Time)
|
||||||
for _, t := range existingTracks {
|
for _, t := range tracks {
|
||||||
key := t.Artist + "|" + t.SongName
|
key := t.Artist + "|" + t.SongName
|
||||||
existingIndex[key] = append(existingIndex[key], t.Timestamp)
|
index[key] = append(index[key], t.Timestamp)
|
||||||
}
|
}
|
||||||
|
return index
|
||||||
|
}
|
||||||
|
|
||||||
// check each new track against indexed existing tracks
|
func findDuplicates(tracks []SpotifyTrack, dbIndex map[string][]time.Time) map[string]struct{} {
|
||||||
for _, newTrack := range tracks {
|
duplicates := make(map[string]struct{})
|
||||||
newTs, err := time.Parse(time.RFC3339Nano, newTrack.Timestamp)
|
seenInBatch := make(map[string]struct{})
|
||||||
|
|
||||||
|
for _, track := range tracks {
|
||||||
|
trackKey, err := createTrackKey(track)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
lookupKey := newTrack.Artist + "|" + newTrack.Name
|
// in batch check
|
||||||
if timestamps, found := existingIndex[lookupKey]; found {
|
if _, seen := seenInBatch[trackKey]; seen {
|
||||||
for _, existTs := range timestamps {
|
duplicates[trackKey] = struct{}{}
|
||||||
diff := newTs.Sub(existTs)
|
continue
|
||||||
|
}
|
||||||
|
seenInBatch[trackKey] = struct{}{}
|
||||||
|
|
||||||
|
// in db check
|
||||||
|
lookupKey := fmt.Sprintf("%s|%s", track.Artist, track.Name)
|
||||||
|
if dbTimestamps, found := dbIndex[lookupKey]; found {
|
||||||
|
if isDuplicateWithinWindow(track, dbTimestamps) {
|
||||||
|
duplicates[trackKey] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return duplicates
|
||||||
|
}
|
||||||
|
|
||||||
|
func createTrackKey(track SpotifyTrack) (string, error) {
|
||||||
|
ts, err := normalizeTs(track.Timestamp)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%s|%s|%s", track.Artist, track.Name, ts), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if a track timestamp falls < 20 seconds of another
|
||||||
|
func isDuplicateWithinWindow(track SpotifyTrack, existingTimestamps []time.Time) bool {
|
||||||
|
trackTime, err := time.Parse(time.RFC3339Nano, track.Timestamp)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for _, existingTime := range existingTimestamps {
|
||||||
|
diff := trackTime.Sub(existingTime)
|
||||||
if diff < 0 {
|
if diff < 0 {
|
||||||
diff = -diff
|
diff = -diff
|
||||||
}
|
}
|
||||||
if diff < 20*time.Second {
|
if diff < 20*time.Second {
|
||||||
key := fmt.Sprintf(
|
return true
|
||||||
"%s|%s|%s",
|
|
||||||
newTrack.Artist,
|
|
||||||
newTrack.Name,
|
|
||||||
newTrack.Timestamp,
|
|
||||||
)
|
|
||||||
existing[key] = struct{}{}
|
|
||||||
break
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return false
|
||||||
}
|
|
||||||
|
|
||||||
return existing, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func ImportSpotify(tracks []SpotifyTrack, userId int) error {
|
func ImportSpotify(tracks []SpotifyTrack, userId int) error {
|
||||||
totalImported := 0
|
totalImported := 0
|
||||||
batchSize := 1000
|
totalTracks := len(tracks)
|
||||||
|
batchStart := 0
|
||||||
|
|
||||||
for batchStart := 0; batchStart < len(tracks); batchStart += batchSize {
|
for batchStart < totalTracks {
|
||||||
// get the limit of the current batch
|
// cap batchEnd at total track count on final batch to prevent OOB error
|
||||||
batchEnd := batchStart + batchSize
|
batchEnd := min(batchStart+batchSize, totalTracks)
|
||||||
// set limit to track array length in current batch too big
|
|
||||||
if batchEnd > len(tracks) {
|
|
||||||
batchEnd = len(tracks)
|
|
||||||
}
|
|
||||||
|
|
||||||
// create array to hold valid listens
|
|
||||||
var validTracks []SpotifyTrack
|
var validTracks []SpotifyTrack
|
||||||
for i := batchStart; i < batchEnd; i++ {
|
for i := batchStart; i < batchEnd; i++ {
|
||||||
// if current track is listened to for 20 sec and name and artist is not
|
if tracks[i].Played >= 20000 && // 20 seconds
|
||||||
// blank, add to validTracks array
|
tracks[i].Name != "" &&
|
||||||
if tracks[i].Played >= 20000 && tracks[i].Name != "" && tracks[i].Artist != "" {
|
tracks[i].Artist != "" {
|
||||||
validTracks = append(validTracks, tracks[i])
|
validTracks = append(validTracks, tracks[i])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if there are no valid tracks in this batch, go to the next
|
|
||||||
if len(validTracks) == 0 {
|
if len(validTracks) == 0 {
|
||||||
|
batchStart += batchSize
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// find replayed tracks in the batch that was just gathered
|
tracksToSkip, err := getExistingTracks(userId, validTracks)
|
||||||
existing, err := getExistingTracks(userId, validTracks)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, "Error checking existing tracks: %v\n", err)
|
fmt.Fprintf(os.Stderr, "Error checking existing tracks: %v\n", err)
|
||||||
|
batchStart += batchSize
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// get data of struct pointer
|
|
||||||
src := &trackSource{
|
src := &trackSource{
|
||||||
tracks: validTracks,
|
tracks: validTracks,
|
||||||
existing: existing,
|
tracksToSkip: tracksToSkip,
|
||||||
idx: 0,
|
idx: 0,
|
||||||
userId: userId,
|
userId: userId,
|
||||||
}
|
}
|
||||||
|
|
||||||
// insert all valid tracks from current batch into db
|
|
||||||
copyCount, err := db.Pool.CopyFrom(
|
copyCount, err := db.Pool.CopyFrom(
|
||||||
context.Background(),
|
context.Background(),
|
||||||
pgx.Identifier{"history"},
|
pgx.Identifier{"history"},
|
||||||
@@ -237,6 +273,7 @@ func ImportSpotify(tracks []SpotifyTrack, userId int) error {
|
|||||||
} else {
|
} else {
|
||||||
totalImported += int(copyCount)
|
totalImported += int(copyCount)
|
||||||
}
|
}
|
||||||
|
batchStart += batchSize
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user