Get unique list of files we want to download
This commit is contained in:
parent
c5830aeadc
commit
32932fe803
@ -10,9 +10,10 @@ mvw_endpoint = http://localhost:8000/api/query
|
||||
title_dedup_winner = first
|
||||
|
||||
[maus]
|
||||
min_duration = 1800
|
||||
min_duration = 1200
|
||||
max_duration = 2700
|
||||
query = @maus-query.json
|
||||
title_not_regex = audiodeskription|gebärdensprache
|
||||
# query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
|
||||
# state_file_name = maus
|
||||
# tmp_base_dir = %(tmp_base_dir)s/maus
|
||||
|
@ -17,5 +17,5 @@
|
||||
"sortOrder": "desc",
|
||||
"future": false,
|
||||
"offset": 0,
|
||||
"size": 8
|
||||
"size": 50
|
||||
}
|
||||
|
111
mvw-dl.py
111
mvw-dl.py
@ -2,6 +2,7 @@ import configparser
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import requests
|
||||
import inflect
|
||||
@ -9,6 +10,8 @@ from rich.logging import RichHandler
|
||||
from rich.traceback import install
|
||||
import typing as t
|
||||
from rich.console import Console
|
||||
|
||||
import type_def.mvw_json_response
|
||||
from type_def.mvw_json_request import MVWJSONRequest
|
||||
from type_def.mvw_json_response import MVWJSONResponse
|
||||
|
||||
@ -185,7 +188,7 @@ def get_json_response(
|
||||
config_obj: configparser.ConfigParser(),
|
||||
payload: MVWJSONRequest) -> MVWJSONResponse:
|
||||
log.debug(f"Downloading JSON list of Mediathek files that match search criteria")
|
||||
serialized_payload = json.dumps(payload)
|
||||
serialized_payload = payload.json()
|
||||
url = config_obj.get(section_name, "mvw_endpoint")
|
||||
req_header = {"Content-Type": "text/plain"}
|
||||
s = requests.Session()
|
||||
@ -197,31 +200,87 @@ def get_json_response(
|
||||
f"""{newline.join(f"Header '{header}': '{value}'" for header, value in list(req.headers.items()))}\n"""
|
||||
f"Payload: {payload}")
|
||||
with s.send(prepped) as s:
|
||||
# return json.loads(s.content)
|
||||
got_json_response = MVWJSONResponse(**json.loads(s.content))
|
||||
return got_json_response
|
||||
|
||||
|
||||
def remove_result(
        json_obj: MVWJSONResponse,
        result_obj: type_def.mvw_json_response.Show) -> MVWJSONResponse:
    """Drop one show from a search response and keep its counter in step.

    The response is modified in place: ``result_obj`` is removed from
    ``json_obj.result.results`` and ``json_obj.result.queryInfo.resultCount``
    is decremented by one.

    Args:
        json_obj: Search response to modify.
        result_obj: The show to take out of the response.

    Returns:
        The same ``json_obj`` instance that was passed in.
    """
    search_result = json_obj.result
    # NOTE(review): presumably ``results`` is a list, in which case ``remove``
    # raises ValueError for a show that is not present - confirm in type_def.
    search_result.results.remove(result_obj)
    search_result.queryInfo.resultCount -= 1
    return json_obj
|
||||
|
||||
|
||||
def log_result_count(result_count: int, pre_filter: bool = True) -> None:
    """Write the number of shows in a search result to the debug log.

    Args:
        result_count: Number of shows to report.
        pre_filter: When true (the default) the message is worded for the
            state before a filter runs ("... going in"); when false it is
            worded for the state after a filter ("now contains ...").
    """
    if not pre_filter:
        log.debug(f"""Search result now contains {result_count} {p.plural("show", result_count)}""")
        return
    log.debug(f"""Search result contains {result_count} {p.plural("show", result_count)} going in""")
|
||||
|
||||
|
||||
def filter_json_by_duration(
        section_name: str,
        config_obj: configparser.ConfigParser,
        json_obj: MVWJSONResponse) -> MVWJSONResponse:
    """Remove shows whose duration falls outside the configured bounds.

    Reads ``min_duration`` and ``max_duration`` from ``section_name``. Each
    bound is optional; a missing or negative value disables that bound. Shows
    are removed from ``json_obj`` in place via ``remove_result``.

    Args:
        section_name: Config section that holds the duration options.
        config_obj: Parsed configuration.
        json_obj: Search response to filter.

    Returns:
        The same ``json_obj`` instance, with non-matching shows removed.

    Raises:
        ValueError: If a configured bound is not an integer.
    """
    # The caller runs this filter when *either* option is present, so each
    # bound needs its own fallback instead of raising NoOptionError.
    min_duration = config_obj.getint(section_name, "min_duration", fallback=-1)
    max_duration = config_obj.getint(section_name, "max_duration", fallback=-1)
    log_result_count(json_obj.result.queryInfo.resultCount)

    if min_duration >= 0:
        log.debug(f"Filtering '[{section_name}]' JSON for minimum length of {min_duration} "
                  f"""{p.plural("second", min_duration)} ...""")
        # Iterate over a snapshot because remove_result mutates the list.
        for result in list(json_obj.result.results):
            if result.duration < min_duration:
                remove_result(json_obj, result)

    if max_duration >= 0:
        log.debug(f"Filtering '[{section_name}]' JSON for maximum length of {max_duration} "
                  f"""{p.plural("second", max_duration)} ...""")
        for result in list(json_obj.result.results):
            if result.duration > max_duration:
                remove_result(json_obj, result)

    log_result_count(json_obj.result.queryInfo.resultCount, False)
    return json_obj
|
||||
|
||||
def filter_json_by_title_regex(
        section_name: str,
        config_obj: configparser.ConfigParser,
        json_obj: MVWJSONResponse) -> MVWJSONResponse:
    """Remove shows whose title matches the section's ``title_not_regex``.

    The pattern is compiled case-insensitively and applied with ``search``,
    so it may match anywhere in the title. Matching shows are removed from
    ``json_obj`` in place via ``remove_result``.

    Args:
        section_name: Config section that holds ``title_not_regex``.
        config_obj: Parsed configuration.
        json_obj: Search response to filter.

    Returns:
        The same ``json_obj`` instance, with matching shows removed.

    Raises:
        re.error: If the configured pattern is not a valid regular expression.
    """
    pattern = config_obj.get(section_name, "title_not_regex")
    if not pattern:
        # An empty pattern matches every title and would discard all results,
        # so treat it as "no exclusion configured".
        log.debug(f"No title regular expression set for '[{section_name}]', skipping title filter")
        return json_obj

    title_not_regex = re.compile(pattern, re.IGNORECASE)
    log_result_count(json_obj.result.queryInfo.resultCount)
    log.debug(f"Filtering '[{section_name}]' JSON by title regular expression")
    # Iterate over a snapshot because remove_result mutates the list.
    for result in list(json_obj.result.results):
        if title_not_regex.search(result.title):
            remove_result(json_obj, result)
    log_result_count(json_obj.result.queryInfo.resultCount, False)
    return json_obj
|
||||
|
||||
|
||||
def dedup_json_titles(
        section_name: str,
        config_obj: configparser.ConfigParser,
        json_obj: MVWJSONResponse) -> MVWJSONResponse:
    """Keep a single show per title, chosen by timestamp.

    Shows are grouped by exact title. Within each group one id wins: the one
    with the smallest timestamp when ``title_dedup_winner`` is ``first``,
    otherwise the one with the largest timestamp. Every show whose id differs
    from its title's winner is removed from ``json_obj`` in place via
    ``remove_result``.

    Args:
        section_name: Config section that holds ``title_dedup_winner``.
        config_obj: Parsed configuration.
        json_obj: Search response to deduplicate.

    Returns:
        The same ``json_obj`` instance, with duplicate titles removed.
    """
    title_dedup_winner = config_obj.get(section_name, "title_dedup_winner")
    pick_winner = min if title_dedup_winner == "first" else max
    log_result_count(json_obj.result.queryInfo.resultCount)

    # title -> {show id -> timestamp}; the first timestamp seen per id is kept.
    timestamps_by_title = {}
    for result in json_obj.result.results:
        timestamps_by_title.setdefault(result.title, {}).setdefault(result.id, result.timestamp)

    # Rank candidate ids by their timestamp. Ranking by the id itself would
    # make "first"/"last" depend on how ids happen to sort as strings.
    winner_by_title = {
        title: pick_winner(candidates, key=candidates.get)
        for title, candidates in timestamps_by_title.items()
    }

    # Iterate over a snapshot because remove_result mutates the list.
    for result in list(json_obj.result.results):
        if result.id != winner_by_title[result.title]:
            log.debug(f"""Deduplicating '[{section_name}]' result "{result.title}" ...""")
            remove_result(json_obj, result)

    log_result_count(json_obj.result.queryInfo.resultCount, False)
    return json_obj
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@ -238,10 +297,16 @@ if __name__ == '__main__':
|
||||
log.debug(f"Processing section '[{section}]' ...")
|
||||
query_payload = get_query_payload(section, config)
|
||||
json_response = get_json_response(section, config, query_payload)
|
||||
log.debug(json_response)
|
||||
quit()
|
||||
log.debug(CONST.CFG_KNOWN_SECTION[0])
|
||||
if config.has_option(section, "min_duration") or config.has_option(section, "max_duration"):
|
||||
json_matches_duration = filter_json_by_duration(section, config, json_response)
|
||||
# console.log(json_response)
|
||||
|
||||
log.debug(f"Filtering results by duration where applicable ...")
|
||||
if config.has_option(section, "min_duration") or config.has_option(section, "max_duration"):
|
||||
json_response = filter_json_by_duration(section, config, json_response)
|
||||
|
||||
log.debug(f"Filtering results by title regular expression where applicable ...")
|
||||
if config.has_option(section, "title_not_regex"):
|
||||
json_response = filter_json_by_title_regex(section, config, json_response)
|
||||
|
||||
log.debug(f"Deduplicating results by title where needed ...")
|
||||
if config.has_option(section, "title_not_regex"):
|
||||
json_response = dedup_json_titles(section, config, json_response)
|
||||
# console.print_json(json_response.json())
|
||||
|
Loading…
x
Reference in New Issue
Block a user