Get unique list of files we want to download
This commit is contained in:
parent
c5830aeadc
commit
32932fe803
@ -10,9 +10,10 @@ mvw_endpoint = http://localhost:8000/api/query
|
|||||||
title_dedup_winner = first
|
title_dedup_winner = first
|
||||||
|
|
||||||
[maus]
|
[maus]
|
||||||
min_duration = 1800
|
min_duration = 1200
|
||||||
max_duration = 2700
|
max_duration = 2700
|
||||||
query = @maus-query.json
|
query = @maus-query.json
|
||||||
|
title_not_regex = audiodeskription|gebärdensprache
|
||||||
# query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
|
# query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
|
||||||
# state_file_name = maus
|
# state_file_name = maus
|
||||||
# tmp_base_dir = %(tmp_base_dir)s/maus
|
# tmp_base_dir = %(tmp_base_dir)s/maus
|
||||||
|
@ -17,5 +17,5 @@
|
|||||||
"sortOrder": "desc",
|
"sortOrder": "desc",
|
||||||
"future": false,
|
"future": false,
|
||||||
"offset": 0,
|
"offset": 0,
|
||||||
"size": 8
|
"size": 50
|
||||||
}
|
}
|
||||||
|
111
mvw-dl.py
111
mvw-dl.py
@ -2,6 +2,7 @@ import configparser
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import requests
|
import requests
|
||||||
import inflect
|
import inflect
|
||||||
@ -9,6 +10,8 @@ from rich.logging import RichHandler
|
|||||||
from rich.traceback import install
|
from rich.traceback import install
|
||||||
import typing as t
|
import typing as t
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
|
||||||
|
import type_def.mvw_json_response
|
||||||
from type_def.mvw_json_request import MVWJSONRequest
|
from type_def.mvw_json_request import MVWJSONRequest
|
||||||
from type_def.mvw_json_response import MVWJSONResponse
|
from type_def.mvw_json_response import MVWJSONResponse
|
||||||
|
|
||||||
@ -185,7 +188,7 @@ def get_json_response(
|
|||||||
config_obj: configparser.ConfigParser(),
|
config_obj: configparser.ConfigParser(),
|
||||||
payload: MVWJSONRequest) -> MVWJSONResponse:
|
payload: MVWJSONRequest) -> MVWJSONResponse:
|
||||||
log.debug(f"Downloading JSON list of Mediathek files that match search criteria")
|
log.debug(f"Downloading JSON list of Mediathek files that match search criteria")
|
||||||
serialized_payload = json.dumps(payload)
|
serialized_payload = payload.json()
|
||||||
url = config_obj.get(section_name, "mvw_endpoint")
|
url = config_obj.get(section_name, "mvw_endpoint")
|
||||||
req_header = {"Content-Type": "text/plain"}
|
req_header = {"Content-Type": "text/plain"}
|
||||||
s = requests.Session()
|
s = requests.Session()
|
||||||
@ -197,31 +200,87 @@ def get_json_response(
|
|||||||
f"""{newline.join(f"Header '{header}': '{value}'" for header, value in list(req.headers.items()))}\n"""
|
f"""{newline.join(f"Header '{header}': '{value}'" for header, value in list(req.headers.items()))}\n"""
|
||||||
f"Payload: {payload}")
|
f"Payload: {payload}")
|
||||||
with s.send(prepped) as s:
|
with s.send(prepped) as s:
|
||||||
# return json.loads(s.content)
|
|
||||||
got_json_response = MVWJSONResponse(**json.loads(s.content))
|
got_json_response = MVWJSONResponse(**json.loads(s.content))
|
||||||
return got_json_response
|
return got_json_response
|
||||||
|
|
||||||
|
|
||||||
|
def remove_result(
        json_obj: MVWJSONResponse,
        result_obj: type_def.mvw_json_response.Show) -> MVWJSONResponse:
    """Remove one show from a search response and lower its result counter.

    Args:
        json_obj: Response whose ``result.results`` list is modified in place.
        result_obj: Entry to take out of ``result.results``.

    Returns:
        The same ``json_obj`` that was passed in, after modification.
    """
    payload = json_obj.result
    payload.results.remove(result_obj)
    # Keep the reported count in step with the shortened list.
    payload.queryInfo.resultCount = payload.queryInfo.resultCount - 1
    return json_obj
|
||||||
|
|
||||||
|
|
||||||
|
def log_result_count(result_count: int, pre_filter: bool = True) -> None:
    """Write the current number of shows in a search result to the debug log.

    Args:
        result_count: Number of shows to report.
        pre_filter: When true, word the message as the count "going in";
            otherwise word it as the count the result "now contains".
    """
    # "show" or "shows", depending on the count.
    noun = p.plural("show", result_count)
    if not pre_filter:
        log.debug(f"Search result now contains {result_count} {noun}")
        return
    log.debug(f"Search result contains {result_count} {noun} going in")
|
||||||
|
|
||||||
|
|
||||||
def filter_json_by_duration(
        section_name: str,
        config_obj: configparser.ConfigParser,
        json_obj: MVWJSONResponse) -> MVWJSONResponse:
    """Drop shows whose duration lies outside the section's configured bounds.

    Reads ``min_duration`` and ``max_duration`` from ``section_name``. A bound
    that is negative is not applied. A bound that is not configured at all is
    treated the same way, because the caller runs this filter when *either*
    option is present, so one of the two may legitimately be absent.

    Args:
        section_name: Config section to read the bounds from.
        config_obj: Parsed configuration.
        json_obj: Search response; its result list is modified in place.

    Returns:
        The same ``json_obj`` with out-of-range shows removed.
    """
    # fallback=-1 maps "option not set" onto the existing "negative means
    # no limit" convention instead of raising NoOptionError.
    min_duration = config_obj.getint(section_name, "min_duration", fallback=-1)
    max_duration = config_obj.getint(section_name, "max_duration", fallback=-1)
    log_result_count(json_obj.result.queryInfo.resultCount)
    if min_duration >= 0:
        log.debug(f"Filtering '[{section_name}]' JSON for minimum length of {min_duration} "
                  f"""{p.plural("second", min_duration)} ...""")
        # Iterate over a snapshot because remove_result() mutates the list.
        for result in json_obj.result.results.copy():
            if result.duration < min_duration:
                remove_result(json_obj, result)
    if max_duration >= 0:
        log.debug(f"Filtering '[{section_name}]' JSON for maximum length of {max_duration} "
                  f"""{p.plural("second", max_duration)} ...""")
        for result in json_obj.result.results.copy():
            if result.duration > max_duration:
                remove_result(json_obj, result)
    log_result_count(json_obj.result.queryInfo.resultCount, False)
    return json_obj
|
||||||
|
|
||||||
# json_matches_min_length =
|
|
||||||
pass
|
def filter_json_by_title_regex(
        section_name: str,
        config_obj: configparser.ConfigParser(),
        json_obj: MVWJSONResponse) -> MVWJSONResponse:
    """Remove every show whose title matches the section's exclusion pattern.

    The pattern is read from the ``title_not_regex`` option of
    ``section_name`` and is applied case-insensitively anywhere in the title.

    Args:
        section_name: Config section to read the pattern from.
        config_obj: Parsed configuration.
        json_obj: Search response; its result list is modified in place.

    Returns:
        The same ``json_obj`` with matching shows removed.
    """
    pattern_text = config_obj.get(section_name, "title_not_regex")
    matcher = re.compile(pattern_text, re.IGNORECASE)
    log_result_count(json_obj.result.queryInfo.resultCount)
    log.debug(f"Filtering '[{section_name}]' JSON by title regular expression")
    # Collect the matches first, then remove them, so the list is never
    # mutated while it is being scanned.
    unwanted = [show for show in json_obj.result.results if matcher.search(show.title)]
    for show in unwanted:
        remove_result(json_obj, show)
    log_result_count(json_obj.result.queryInfo.resultCount, False)
    return json_obj
|
||||||
|
|
||||||
|
|
||||||
|
def dedup_json_titles(
        section_name: str,
        config_obj: configparser.ConfigParser,
        json_obj: MVWJSONResponse) -> MVWJSONResponse:
    """Keep exactly one show per title and remove the other duplicates.

    Which duplicate survives is controlled by the ``title_dedup_winner``
    option of ``section_name``: ``first`` keeps the entry with the smallest
    timestamp, any other value keeps the entry with the largest timestamp.

    Args:
        section_name: Config section to read ``title_dedup_winner`` from.
        config_obj: Parsed configuration.
        json_obj: Search response; its result list is modified in place.

    Returns:
        The same ``json_obj`` with at most one show left per title.
    """
    title_dedup_winner = config_obj.get(section_name, "title_dedup_winner")
    pick = min if title_dedup_winner == "first" else max
    log_result_count(json_obj.result.queryInfo.resultCount)

    # title -> {show id -> timestamp}; the first timestamp seen for an id wins.
    timestamps_by_title = {}
    for result in json_obj.result.results:
        timestamps_by_title.setdefault(result.title, {}).setdefault(result.id, result.timestamp)

    # Rank the candidate ids by their timestamp, not by the id's text, so
    # that "first" really means the earliest entry.
    # NOTE(review): assumes timestamps of one title are mutually comparable
    # (e.g. all integers) - confirm against the response model.
    winner_by_title = {
        title: pick(ids, key=ids.get)
        for title, ids in timestamps_by_title.items()
    }

    # Iterate over a snapshot because remove_result() mutates the list.
    for result in json_obj.result.results.copy():
        if result.id != winner_by_title[result.title]:
            log.debug(f"""Deduplicating '[{section_name}]' result "{result.title}" ...""")
            remove_result(json_obj, result)
    log_result_count(json_obj.result.queryInfo.resultCount, False)
    return json_obj
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
@ -238,10 +297,16 @@ if __name__ == '__main__':
|
|||||||
log.debug(f"Processing section '[{section}]' ...")
|
log.debug(f"Processing section '[{section}]' ...")
|
||||||
query_payload = get_query_payload(section, config)
|
query_payload = get_query_payload(section, config)
|
||||||
json_response = get_json_response(section, config, query_payload)
|
json_response = get_json_response(section, config, query_payload)
|
||||||
log.debug(json_response)
|
|
||||||
quit()
|
|
||||||
log.debug(CONST.CFG_KNOWN_SECTION[0])
|
|
||||||
if config.has_option(section, "min_duration") or config.has_option(section, "max_duration"):
|
|
||||||
json_matches_duration = filter_json_by_duration(section, config, json_response)
|
|
||||||
# console.log(json_response)
|
|
||||||
|
|
||||||
|
log.debug(f"Filtering results by duration where applicable ...")
|
||||||
|
if config.has_option(section, "min_duration") or config.has_option(section, "max_duration"):
|
||||||
|
json_response = filter_json_by_duration(section, config, json_response)
|
||||||
|
|
||||||
|
log.debug(f"Filtering results by title regular expression where applicable ...")
|
||||||
|
if config.has_option(section, "title_not_regex"):
|
||||||
|
json_response = filter_json_by_title_regex(section, config, json_response)
|
||||||
|
|
||||||
|
log.debug(f"Deduplicating results by title where needed ...")
|
||||||
|
if config.has_option(section, "title_not_regex"):
|
||||||
|
json_response = dedup_json_titles(section, config, json_response)
|
||||||
|
# console.print_json(json_response.json())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user