@zelsaddr
Last active December 22, 2021 07:50
Journal Search by Sinta URL (Filtering Indonesian-Language Journals Only)
import requests, re, time
from colorama import Fore, Style
import urllib3
## Required libraries: requests, colorama. If not yet installed, run: pip install requests colorama
## Expected URL format: https://sinta.kemdikbud.go.id/journals/detail?page=2&id=1270 (the page parameter is required)
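## A minimal way to run this script (the filename below is hypothetical; save it under any name):
##   pip install requests colorama
##   python sinta_search.py
##   Main URL page 1 : https://sinta.kemdikbud.go.id/journals/detail?page=1&id=1270
## The prompt text and URL shape above follow the input() call and the example URL documented in this script.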
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

journal = {}  # page number -> journal records parsed from that page (filled by the crawl loop below)
def checkFromIndonesia(text):
    # Detect the source language of `text` via Google's translate endpoint; returns a
    # colorized "YES" if Google reports Indonesian ("id"), otherwise a colorized "NOT".
    TL = requests.Session()
    # The cookies and headers below were captured from a browser session and may expire.
    cookies = {
        'OGPC': '19022622-1:',
        'HSID': 'AjsTTEwzVJWMZQWGD',
        'SSID': 'ACMtLMERP3mYjaz2g',
        'APISID': 'x_cSt6FZ9K_OFNjX/AFnQoiPMjll6gXx23',
        'SAPISID': 'vUOZPdeoHKkmZI83/AdJPQAX2Tl2dYZXbG',
        '__Secure-1PAPISID': 'vUOZPdeoHKkmZI83/AdJPQAX2Tl2dYZXbG',
        '__Secure-3PAPISID': 'vUOZPdeoHKkmZI83/AdJPQAX2Tl2dYZXbG',
        'SID': 'Ewh6Ijl9oQFrGCCV-vN-7e7GF_izTmfJNGDqXeyZiPosfUU_WFvbTNA6kd5N5gGaBnYOgA.',
        '__Secure-1PSID': 'Ewh6Ijl9oQFrGCCV-vN-7e7GF_izTmfJNGDqXeyZiPosfUU_f8Jga7Ol8NIWJXhyGOEk-A.',
        '__Secure-3PSID': 'Ewh6Ijl9oQFrGCCV-vN-7e7GF_izTmfJNGDqXeyZiPosfUU_kHGD3vOxcFGqHQBL8O8B8g.',
        'SEARCH_SAMESITE': 'CgQIoZQB',
        'NID': '511=QiLQyPQ5crE9fNmgUujFWO-9JQkH2JeMUGBB3_mIt5s-9Fg_70xH3e_-3rUfC_3Ko7Lsxqs_TvE275Zw0s5jdQ6tbI7ZjtEomcM00DZurWJ62U1NIK6IfnPBAaTzbGlV2KodpEiqBuJcaR1DO7FMROLvGtkjQdrSlZNKVgBp698U3Nt6Pzjomjug7cYPAg1xCwKYlpURvvvXAfnMfde-p6QWgxdfEPGGyY4ZYNhcv6FA4EpnvaLh171sfqI4MjhGJpm0g7UNBiKG8cFXpbJzRzkinU3xy8aAt7I2OVMUZS6LFqZhHqXDJoJ4t8UuQ0DxXwvUJHCbKZ5uSyyVoIVHexQ4o8BBhKicfDy8sMe3bF89HhJ5INLYnJC-LuHAhO9sJ11p_uC_dZGWidHoZWaYGj1AIWVbdXN6wjrWmaihbb-VVbnhNmE_G7rV8BeDJYxzZQHWI-ymZyq4lz5eRcP-2t4Ax6RnkY4j6gYw2vI',
        '1P_JAR': '2021-12-22-05',
        'SIDCC': 'AJi4QfEFpxpXOAf7NNkJEhsgiXNIFKVFuHM94iPa3Temnrtkm5zrGky3ksGDHeFmXoO9kiBRqw',
        '__Secure-3PSIDCC': 'AJi4QfEJOxGLcAC7l6DbzCRkWToIRTCsZDVYWXqwOkwFFmLi4QngbMrsm5w-CeWhRZgSyHZW_DJf',
    }
    headers = {
        'Host': 'clients5.google.com',
        'Sec-Ch-Ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'Sec-Ch-Ua-Mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Accept': '*/*',
        'X-Client-Data': 'CKu1yQEIiLbJAQimtskBCMS2yQEIqZ3KAQie+csBCOeEzAEItYXMAQjLicwBCNKPzAEYjp7LAQ==',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,id;q=0.8',
    }
    params = (
        ('client', 'dict-chrome-ex'),
        ('tbb', '1'),
        ('ie', 'UTF-8'),
        ('oe', 'UTF-8'),
        ('sl', 'auto'),  # source language: auto-detect
        ('tl', 'id'),    # target language: Indonesian
        ('q', text),
    )
    response = TL.get('https://clients5.google.com/translate_a/t', headers=headers, params=params, cookies=cookies, verify=False)
    try:
        if response.json()['src'] == "id":
            return Fore.GREEN + "YES"
        else:
            return Fore.RED + "NOT"
    except Exception:
        # The endpoint occasionally returns an unparsable or incomplete response; back off and retry once.
        print("\t[+] An error occurred while detecting the source language, sleeping for 15 sec...")
        time.sleep(15)
        response = TL.get('https://clients5.google.com/translate_a/t', headers=headers, params=params, cookies=cookies, verify=False)
        if response.json()['src'] == "id":
            return Fore.GREEN + "YES"
        else:
            return Fore.RED + "NOT"
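# Quick sanity check (hypothetical input; needs network access and non-expired cookies):
#   checkFromIndonesia("Penelitian ini membahas kualitas jurnal ilmiah")
# should return Fore.GREEN + "YES" once Google reports the source language as "id";
# an English-only title would come back as Fore.RED + "NOT".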
def getpage(url):
    # Fetch a Sinta page and return its raw HTML.
    return requests.get(url, verify=False).text
def extract_url(content):
    # Pull (url, name, publisher, index) tuples out of the journal-list HTML, check each
    # name's language, and append journals detected as Indonesian to journal_from_ind.txt.
    u = {}
    journal_name = re.findall(r"<dl class=\"uk-description-list-line\">\n\s+.*\n\s+.*class=\"paper-link\"\shref=\"(.*?)\".*>(.*?)<\/a>\n\s+.*\n\s+<dd>(.*?)<\/dd>\s+.*\n(.*?)<\/dd>", content)
    for i, k in enumerate(journal_name):
        check = checkFromIndonesia(str(k[1]).strip())
        print("\t[+] [" + Fore.BLUE + str(k[1]) + Style.RESET_ALL + "] [ " + Fore.YELLOW + "From Indonesia? : " + check + Style.RESET_ALL + " ]")
        u[i] = {
            'journal_url': k[0],
            'journal_name': k[1],
            'journal_by': k[2],
            'journal_index': k[3].strip(),
            'journal_from_indo': bool(re.search(r"YES", check))
        }
        if re.search(r"YES", check):
            with open("journal_from_ind.txt", "a+") as s:
                s.write("Journal Name : {}\nJournal URL : {}\nJournal Author : {}\n\n".format(str(k[1]), str(k[0]), str(k[2])))
        # time.sleep(1.5)
    return u
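# Shape of the dict returned by extract_url, one entry per journal on the page
# (all values here are illustrative placeholders, not real Sinta data):
#   {0: {'journal_url': 'https://example.org/journal', 'journal_name': 'Jurnal Contoh',
#        'journal_by': 'Universitas Contoh', 'journal_index': 'S2', 'journal_from_indo': True}}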
def get_total_page(content):
    # e.g. "Page 2 of 53 | Total Records : 1040" -> 53
    # (the "|" must be escaped, otherwise it is parsed as a regex alternation)
    return int(re.search(r"of (\d+)\s*\|\s*Total Records", content)[1])
def get_journal_index_name(content):
    # The page's <div class="au-name"> holds the name of the journal index being browsed.
    return re.search(r"<div class=\"au-name\">(.*?)<\/div>", content)[1]
url = input("Main URL page 1 : ")
print("[+] Getting journal info....\n")
time.sleep(2)
getInfo = getpage(url)
total_pages = get_total_page(getInfo)
print("[+] Journal Index Name : " + str(get_journal_index_name(getInfo)))
print("[+] Total Page : " + str(total_pages))
print("[+] Rebuilding information for crawling mode...\n")
time.sleep(2)
for x in range(1, total_pages + 1):
    print("[~] Crawling page " + str(x) + "...")
    # Rewrite the page parameter so the URL entered above can be reused for every page.
    new_url = re.sub(r"page=\d+&", "page=" + str(x) + "&", url)
    getInfo = getpage(new_url)
    journal[x] = extract_url(getInfo)
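When a crawl finishes, every journal detected as Indonesian has been appended to journal_from_ind.txt, one block per journal in the format written by extract_url above (the values shown here are placeholders):

Journal Name : Jurnal Contoh
Journal URL : https://sinta.kemdikbud.go.id/journals/detail?id=1270
Journal Author : Universitas Contoh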